From f038a330c25038c71dd5a6f2017ee863d6285a28 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Fri, 19 Apr 2024 22:32:33 +0000 Subject: [PATCH 01/25] Implement configurable failure policy. --- api/jobset/v1alpha2/jobset_types.go | 54 ++- api/jobset/v1alpha2/openapi_generated.go | 81 ++++ api/jobset/v1alpha2/zz_generated.deepcopy.go | 34 +- .../jobset/v1alpha2/failurepolicy.go | 16 +- .../jobset/v1alpha2/failurepolicyrule.go | 61 +++ .../jobset/v1alpha2/jobsetstatus.go | 15 +- client-go/applyconfiguration/utils.go | 2 + .../crd/bases/jobset.x-k8s.io_jobsets.yaml | 47 +++ hack/python-sdk/swagger.json | 45 +++ pkg/controllers/failure_policy.go | 255 ++++++++++++ pkg/controllers/failure_policy_test.go | 279 ++++++++++++++ pkg/controllers/jobset_controller.go | 70 ++-- pkg/controllers/jobset_controller_test.go | 34 +- pkg/webhooks/jobset_webhook.go | 30 ++ pkg/webhooks/jobset_webhook_test.go | 68 ++++ sdk/python/README.md | 1 + .../docs/JobsetV1alpha2FailurePolicy.md | 1 + .../docs/JobsetV1alpha2FailurePolicyRule.md | 13 + sdk/python/docs/JobsetV1alpha2JobSetStatus.md | 1 + sdk/python/jobset/__init__.py | 1 + sdk/python/jobset/models/__init__.py | 1 + .../models/jobset_v1alpha2_failure_policy.py | 34 +- .../jobset_v1alpha2_failure_policy_rule.py | 180 +++++++++ .../models/jobset_v1alpha2_job_set_status.py | 34 +- .../test_jobset_v1alpha2_failure_policy.py | 12 +- ...est_jobset_v1alpha2_failure_policy_rule.py | 64 ++++ .../test/test_jobset_v1alpha2_job_set.py | 20 +- .../test/test_jobset_v1alpha2_job_set_list.py | 40 +- .../test/test_jobset_v1alpha2_job_set_spec.py | 12 +- .../test_jobset_v1alpha2_job_set_status.py | 3 +- .../controller/jobset_controller_test.go | 362 +++++++++++++++++- 31 files changed, 1785 insertions(+), 85 deletions(-) create mode 100644 client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go create mode 100644 pkg/controllers/failure_policy.go create mode 100644 pkg/controllers/failure_policy_test.go create mode 100644 sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md create mode 100644 sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py create mode 100644 sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py diff --git a/api/jobset/v1alpha2/jobset_types.go b/api/jobset/v1alpha2/jobset_types.go index 2a1d39af..54733cb1 100644 --- a/api/jobset/v1alpha2/jobset_types.go +++ b/api/jobset/v1alpha2/jobset_types.go @@ -22,10 +22,11 @@ import ( const ( JobSetNameKey string = "jobset.sigs.k8s.io/jobset-name" ReplicatedJobReplicas string = "jobset.sigs.k8s.io/replicatedjob-replicas" - ReplicatedJobNameKey string = "jobset.sigs.k8s.io/replicatedjob-name" - JobIndexKey string = "jobset.sigs.k8s.io/job-index" - JobKey string = "jobset.sigs.k8s.io/job-key" - JobNameKey string = "job-name" // TODO(#26): Migrate to the fully qualified label name. + // ReplicatedJobNameKey is used to index into a Jobs labels and retrieve the name of the parent ReplicatedJob + ReplicatedJobNameKey string = "jobset.sigs.k8s.io/replicatedjob-name" + JobIndexKey string = "jobset.sigs.k8s.io/job-index" + JobKey string = "jobset.sigs.k8s.io/job-key" + JobNameKey string = "job-name" // TODO(#26): Migrate to the fully qualified label name. // ExclusiveKey is an annotation that can be set on the JobSet or on a ReplicatedJob template. // If set at the JobSet level, all child jobs from all ReplicatedJobs will be scheduled using exclusive // job placement per topology group (defined as the label value). @@ -130,6 +131,9 @@ type JobSetStatus struct { // Restarts tracks the number of times the JobSet has restarted (i.e. recreated in case of RecreateAll policy). Restarts int32 `json:"restarts,omitempty"` + // RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts. + RestartsCountTowardsMax int32 `json:"restartsCountTowardsMax,omitempty"` + // ReplicatedJobsStatus track the number of JobsReady for each replicatedJob. // +optional // +listType=map @@ -229,10 +233,52 @@ const ( OperatorAny Operator = "Any" ) +// FailurePolicyAction defines the action the JobSet controller will take for +// a given FailurePolicyRule. +type FailurePolicyAction string + +const ( + // Fail the JobSet immediately, regardless of maxRestarts. + FailJobSet FailurePolicyAction = "FailJobSet" + + // Restart the JobSet if the number of restart attempts is less than MaxRestarts. + // Otherwise, fail the JobSet. + RestartJobSet FailurePolicyAction = "RestartJobSet" + + // Do not count the failure against maxRestarts. + RestartJobSetAndIgnoreMaxRestarts FailurePolicyAction = "RestartJobSetAndIgnoreMaxRestarts" +) + +// FailurePolicyRule defines a FailurePolicyAction to be executed if a child job +// fails due to a reason listed in OnJobFailureReasons. +type FailurePolicyRule struct { + // The action to take if the rule is matched. + // +kubebuilder:validation:Enum:=FailJobSet;RestartJobSet;RestartJobSetAndIgnoreMaxRestarts + Action FailurePolicyAction `json:"action"` + // The requirement on the job failure reasons. The requirement + // is satisfied if at least one reason matches the list. + // The rules are evaluated in order, and the first matching + // rule is executed. + // An empty list applies the rule to any job failure reason. + // +kubebuilder:validation:UniqueItems:true + OnJobFailureReasons []string `json:"onJobFailureReasons"` + // TargetReplicatedJobs are the names of the replicated jobs the operator applies to. + // An empty list will apply to all replicatedJobs. + // +optional + // +listType=atomic + TargetReplicatedJobs []string `json:"targetReplicatedJobs,omitempty"` +} + type FailurePolicy struct { // MaxRestarts defines the limit on the number of JobSet restarts. // A restart is achieved by recreating all active child jobs. MaxRestarts int32 `json:"maxRestarts,omitempty"` + + // List of failure policy rules for this JobSet. + // For a given Job failure, the rules will be evaluated in order, + // and only the first matching rule will be executed. + // If no matching rule is found, the RestartJobSet action is applied. + Rules []FailurePolicyRule `json:"rules,omitempty"` } type SuccessPolicy struct { diff --git a/api/jobset/v1alpha2/openapi_generated.go b/api/jobset/v1alpha2/openapi_generated.go index 6a63c96d..e4953b34 100644 --- a/api/jobset/v1alpha2/openapi_generated.go +++ b/api/jobset/v1alpha2/openapi_generated.go @@ -25,6 +25,7 @@ import ( func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenAPIDefinition { return map[string]common.OpenAPIDefinition{ "sigs.k8s.io/jobset/api/jobset/v1alpha2.FailurePolicy": schema_jobset_api_jobset_v1alpha2_FailurePolicy(ref), + "sigs.k8s.io/jobset/api/jobset/v1alpha2.FailurePolicyRule": schema_jobset_api_jobset_v1alpha2_FailurePolicyRule(ref), "sigs.k8s.io/jobset/api/jobset/v1alpha2.JobSet": schema_jobset_api_jobset_v1alpha2_JobSet(ref), "sigs.k8s.io/jobset/api/jobset/v1alpha2.JobSetList": schema_jobset_api_jobset_v1alpha2_JobSetList(ref), "sigs.k8s.io/jobset/api/jobset/v1alpha2.JobSetSpec": schema_jobset_api_jobset_v1alpha2_JobSetSpec(ref), @@ -50,7 +51,80 @@ func schema_jobset_api_jobset_v1alpha2_FailurePolicy(ref common.ReferenceCallbac Format: "int32", }, }, + "rules": { + SchemaProps: spec.SchemaProps{ + Description: "List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/jobset/api/jobset/v1alpha2.FailurePolicyRule"), + }, + }, + }, + }, + }, + }, + }, + }, + Dependencies: []string{ + "sigs.k8s.io/jobset/api/jobset/v1alpha2.FailurePolicyRule"}, + } +} + +func schema_jobset_api_jobset_v1alpha2_FailurePolicyRule(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fails due to a reason listed in OnJobFailureReasons.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "action": { + SchemaProps: spec.SchemaProps{ + Description: "The action to take if the rule is matched.", + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, + "onJobFailureReasons": { + SchemaProps: spec.SchemaProps{ + Description: "The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + "targetReplicatedJobs": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, }, + Required: []string{"action", "onJobFailureReasons"}, }, }, } @@ -269,6 +343,13 @@ func schema_jobset_api_jobset_v1alpha2_JobSetStatus(ref common.ReferenceCallback Format: "int32", }, }, + "restartsCountTowardsMax": { + SchemaProps: spec.SchemaProps{ + Description: "RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts.", + Type: []string{"integer"}, + Format: "int32", + }, + }, "replicatedJobsStatus": { VendorExtensible: spec.VendorExtensible{ Extensions: spec.Extensions{ diff --git a/api/jobset/v1alpha2/zz_generated.deepcopy.go b/api/jobset/v1alpha2/zz_generated.deepcopy.go index 7bc6f25b..3262f2a9 100644 --- a/api/jobset/v1alpha2/zz_generated.deepcopy.go +++ b/api/jobset/v1alpha2/zz_generated.deepcopy.go @@ -25,6 +25,13 @@ import ( // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FailurePolicy) DeepCopyInto(out *FailurePolicy) { *out = *in + if in.Rules != nil { + in, out := &in.Rules, &out.Rules + *out = make([]FailurePolicyRule, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailurePolicy. @@ -37,6 +44,31 @@ func (in *FailurePolicy) DeepCopy() *FailurePolicy { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FailurePolicyRule) DeepCopyInto(out *FailurePolicyRule) { + *out = *in + if in.OnJobFailureReasons != nil { + in, out := &in.OnJobFailureReasons, &out.OnJobFailureReasons + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.TargetReplicatedJobs != nil { + in, out := &in.TargetReplicatedJobs, &out.TargetReplicatedJobs + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailurePolicyRule. +func (in *FailurePolicyRule) DeepCopy() *FailurePolicyRule { + if in == nil { + return nil + } + out := new(FailurePolicyRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *JobSet) DeepCopyInto(out *JobSet) { *out = *in @@ -119,7 +151,7 @@ func (in *JobSetSpec) DeepCopyInto(out *JobSetSpec) { if in.FailurePolicy != nil { in, out := &in.FailurePolicy, &out.FailurePolicy *out = new(FailurePolicy) - **out = **in + (*in).DeepCopyInto(*out) } if in.StartupPolicy != nil { in, out := &in.StartupPolicy, &out.StartupPolicy diff --git a/client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go index 9d80d448..82b6756e 100644 --- a/client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go +++ b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go @@ -17,7 +17,8 @@ package v1alpha2 // FailurePolicyApplyConfiguration represents an declarative configuration of the FailurePolicy type for use // with apply. type FailurePolicyApplyConfiguration struct { - MaxRestarts *int32 `json:"maxRestarts,omitempty"` + MaxRestarts *int32 `json:"maxRestarts,omitempty"` + Rules []FailurePolicyRuleApplyConfiguration `json:"rules,omitempty"` } // FailurePolicyApplyConfiguration constructs an declarative configuration of the FailurePolicy type for use with @@ -33,3 +34,16 @@ func (b *FailurePolicyApplyConfiguration) WithMaxRestarts(value int32) *FailureP b.MaxRestarts = &value return b } + +// WithRules adds the given value to the Rules field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Rules field. +func (b *FailurePolicyApplyConfiguration) WithRules(values ...*FailurePolicyRuleApplyConfiguration) *FailurePolicyApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithRules") + } + b.Rules = append(b.Rules, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go new file mode 100644 index 00000000..9bec63bf --- /dev/null +++ b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go @@ -0,0 +1,61 @@ +/* +Copyright 2023 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + v1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" +) + +// FailurePolicyRuleApplyConfiguration represents an declarative configuration of the FailurePolicyRule type for use +// with apply. +type FailurePolicyRuleApplyConfiguration struct { + Action *v1alpha2.FailurePolicyAction `json:"action,omitempty"` + OnJobFailureReasons []string `json:"onJobFailureReasons,omitempty"` + TargetReplicatedJobs []string `json:"targetReplicatedJobs,omitempty"` +} + +// FailurePolicyRuleApplyConfiguration constructs an declarative configuration of the FailurePolicyRule type for use with +// apply. +func FailurePolicyRule() *FailurePolicyRuleApplyConfiguration { + return &FailurePolicyRuleApplyConfiguration{} +} + +// WithAction sets the Action field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Action field is set to the value of the last call. +func (b *FailurePolicyRuleApplyConfiguration) WithAction(value v1alpha2.FailurePolicyAction) *FailurePolicyRuleApplyConfiguration { + b.Action = &value + return b +} + +// WithOnJobFailureReasons adds the given value to the OnJobFailureReasons field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the OnJobFailureReasons field. +func (b *FailurePolicyRuleApplyConfiguration) WithOnJobFailureReasons(values ...string) *FailurePolicyRuleApplyConfiguration { + for i := range values { + b.OnJobFailureReasons = append(b.OnJobFailureReasons, values[i]) + } + return b +} + +// WithTargetReplicatedJobs adds the given value to the TargetReplicatedJobs field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the TargetReplicatedJobs field. +func (b *FailurePolicyRuleApplyConfiguration) WithTargetReplicatedJobs(values ...string) *FailurePolicyRuleApplyConfiguration { + for i := range values { + b.TargetReplicatedJobs = append(b.TargetReplicatedJobs, values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/jobset/v1alpha2/jobsetstatus.go b/client-go/applyconfiguration/jobset/v1alpha2/jobsetstatus.go index 3b18fb01..f31e5d99 100644 --- a/client-go/applyconfiguration/jobset/v1alpha2/jobsetstatus.go +++ b/client-go/applyconfiguration/jobset/v1alpha2/jobsetstatus.go @@ -21,9 +21,10 @@ import ( // JobSetStatusApplyConfiguration represents an declarative configuration of the JobSetStatus type for use // with apply. type JobSetStatusApplyConfiguration struct { - Conditions []v1.Condition `json:"conditions,omitempty"` - Restarts *int32 `json:"restarts,omitempty"` - ReplicatedJobsStatus []ReplicatedJobStatusApplyConfiguration `json:"replicatedJobsStatus,omitempty"` + Conditions []v1.Condition `json:"conditions,omitempty"` + Restarts *int32 `json:"restarts,omitempty"` + RestartsCountTowardsMax *int32 `json:"restartsCountTowardsMax,omitempty"` + ReplicatedJobsStatus []ReplicatedJobStatusApplyConfiguration `json:"replicatedJobsStatus,omitempty"` } // JobSetStatusApplyConfiguration constructs an declarative configuration of the JobSetStatus type for use with @@ -50,6 +51,14 @@ func (b *JobSetStatusApplyConfiguration) WithRestarts(value int32) *JobSetStatus return b } +// WithRestartsCountTowardsMax sets the RestartsCountTowardsMax field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the RestartsCountTowardsMax field is set to the value of the last call. +func (b *JobSetStatusApplyConfiguration) WithRestartsCountTowardsMax(value int32) *JobSetStatusApplyConfiguration { + b.RestartsCountTowardsMax = &value + return b +} + // WithReplicatedJobsStatus adds the given value to the ReplicatedJobsStatus field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, values provided by each call will be appended to the ReplicatedJobsStatus field. diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index c79d195f..99de9197 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -27,6 +27,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { // Group=jobset.x-k8s.io, Version=v1alpha2 case v1alpha2.SchemeGroupVersion.WithKind("FailurePolicy"): return &jobsetv1alpha2.FailurePolicyApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("FailurePolicyRule"): + return &jobsetv1alpha2.FailurePolicyRuleApplyConfiguration{} case v1alpha2.SchemeGroupVersion.WithKind("JobSet"): return &jobsetv1alpha2.JobSetApplyConfiguration{} case v1alpha2.SchemeGroupVersion.WithKind("JobSetSpec"): diff --git a/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml b/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml index ee4e248e..c9bc1b30 100644 --- a/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml +++ b/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml @@ -68,6 +68,47 @@ spec: A restart is achieved by recreating all active child jobs. format: int32 type: integer + rules: + description: |- + List of failure policy rules for this JobSet. + For a given Job failure, the rules will be evaluated in order, + and only the first matching rule will be executed. + If no matching rule is found, the RestartJobSet action is applied. + items: + description: |- + FailurePolicyRule defines a FailurePolicyAction to be executed if a child job + fails due to a reason listed in OnJobFailureReasons. + properties: + action: + description: The action to take if the rule is matched. + enum: + - FailJobSet + - RestartJobSet + - RestartJobSetAndIgnoreMaxRestarts + type: string + onJobFailureReasons: + description: |- + The requirement on the job failure reasons. The requirement + is satisfied if at least one reason matches the list. + The rules are evaluated in order, and the first matching + rule is executed. + An empty list applies the rule to any job failure reason. + items: + type: string + type: array + targetReplicatedJobs: + description: |- + TargetReplicatedJobs are the names of the replicated jobs the operator applies to. + An empty list will apply to all replicatedJobs. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - action + - onJobFailureReasons + type: object + type: array type: object x-kubernetes-validations: - message: Value is immutable @@ -8493,6 +8534,12 @@ spec: (i.e. recreated in case of RecreateAll policy). format: int32 type: integer + restartsCountTowardsMax: + description: RestartsCountTowardsMax tracks the number of times the + JobSet has restarted that counts towards the maximum allowed number + of restarts. + format: int32 + type: integer type: object type: object served: true diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 4f1a3f6c..40d945ff 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -14,6 +14,46 @@ "description": "MaxRestarts defines the limit on the number of JobSet restarts. A restart is achieved by recreating all active child jobs.", "type": "integer", "format": "int32" + }, + "rules": { + "description": "List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied.", + "type": "array", + "items": { + "default": {}, + "$ref": "#/definitions/jobset.v1alpha2.FailurePolicyRule" + } + } + } + }, + "jobset.v1alpha2.FailurePolicyRule": { + "description": "FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fails due to a reason listed in OnJobFailureReasons.", + "type": "object", + "required": [ + "action", + "onJobFailureReasons" + ], + "properties": { + "action": { + "description": "The action to take if the rule is matched.", + "type": "string", + "default": "" + }, + "onJobFailureReasons": { + "description": "The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason.", + "type": "array", + "items": { + "type": "string", + "default": "" + } + }, + "targetReplicatedJobs": { + "description": "TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs.", + "type": "array", + "items": { + "type": "string", + "default": "" + }, + "x-kubernetes-list-type": "atomic" } } }, @@ -149,6 +189,11 @@ "description": "Restarts tracks the number of times the JobSet has restarted (i.e. recreated in case of RecreateAll policy).", "type": "integer", "format": "int32" + }, + "restartsCountTowardsMax": { + "description": "RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts.", + "type": "integer", + "format": "int32" } } }, diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go new file mode 100644 index 00000000..4c2162d5 --- /dev/null +++ b/pkg/controllers/failure_policy.go @@ -0,0 +1,255 @@ +/* +Copyright 2023 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "slices" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + + jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" + "sigs.k8s.io/jobset/pkg/constants" +) + +// actionFunctionMap relates jobset failure policy action names to the appropriate behavior during jobset reconciliation. +var actionFunctionMap = map[jobset.FailurePolicyAction]failurePolicyActionApplier{ + jobset.FailJobSet: failJobSetActionApplier, + jobset.RestartJobSet: restartJobSetActionApplier, + jobset.RestartJobSetAndIgnoreMaxRestarts: restartJobSetAndIgnoreMaxRestartsActionApplier, +} + +// executeFailurePolicy applies the Failure Policy of a JobSet when a failed child Job is found. +// This function is run only when a failed child job has already been found. +func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs, updateStatusOpts *statusUpdateOpts) error { + log := ctrl.LoggerFrom(ctx) + + // If no failure policy is defined, mark the JobSet as failed. + if js.Spec.FailurePolicy == nil { + // firstFailedJob is only computed if necessary since it is expensive to compute + // for JobSets with many child jobs. This is why we don't unconditionally compute + // it once at the beginning of the function and share the results between the different + // possible code paths here. + firstFailedJob := findFirstFailedJob(ownedJobs.failed) + setJobSetFailedCondition(ctx, js, constants.FailedJobsReason, messageWithFirstFailedJob(constants.FailedJobsMessage, firstFailedJob.Name), updateStatusOpts) + return nil + } + + // Check for matching Failure Policy Rule + failurePolicyRule, matchingFailedJob := findFirstFailedPolicyRuleAndJob(ctx, js, ownedJobs.failed) + + failurePolicyRuleAction := jobset.RestartJobSet + if failurePolicyRule != nil { + failurePolicyRuleAction = failurePolicyRule.Action + } + + if err := applyFailurePolicyRuleAction(ctx, js, matchingFailedJob, updateStatusOpts, failurePolicyRuleAction); err != nil { + log.Error(err, "applying FailurePolicyRuleAction %v", failurePolicyRuleAction) + return err + } + + return nil +} + +// If the failure policy rule is not nil, then the functions returns: +// - true if the rule is applicable to the jobFailureReason; +// - false other. +// If the rule is nil, we return true. +func ruleIsApplicable(rule *jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { + if rule == nil { + return true + } + + ruleAppliesToJobFailureReason := len(rule.OnJobFailureReasons) == 0 || slices.Contains(rule.OnJobFailureReasons, jobFailureReason) + if !ruleAppliesToJobFailureReason { + return false + } + + parentReplicatedJob, exists := parentReplicatedJobName(failedJob) + if !exists { + // If we cannot find the parent ReplicatedJob, we assume the rule does not apply. + // TODO: Add a log statement that the failedJob does not appear to have a parent replicated job. + // This error should not happen, but was a pain to debug when adding unit tests. + return false + } + + ruleAppliesToParentReplicatedJob := len(rule.TargetReplicatedJobs) == 0 || slices.Contains(rule.TargetReplicatedJobs, parentReplicatedJob) + return ruleAppliesToParentReplicatedJob +} + +// The function findFirstFailedPolicyRuleAndJob returns the first failure policy rule matching a failed child job. +// The function also returns the first child job matching the failure policy rule returned. +// If there does not exist a matching failure policy rule, the function returns nil for the failure policy rule +// accompanied with the first failing job. This function assumes that the jobset has a non nil failure policy. +func findFirstFailedPolicyRuleAndJob(ctx context.Context, js *jobset.JobSet, failedOwnedJobs []*batchv1.Job) (*jobset.FailurePolicyRule, *batchv1.Job) { + log := ctrl.LoggerFrom(ctx) + + rulesExist := len(js.Spec.FailurePolicy.Rules) > 0 + if !rulesExist { + firstFailedJob := findFirstFailedJob(failedOwnedJobs) + return nil, firstFailedJob + } + + // These variables are only to make the ensuing lines of code shorter. + rules := js.Spec.FailurePolicy.Rules + numRules := len(js.Spec.FailurePolicy.Rules) + + // bucket[i] corresponds to js.Spec.FailurePolicy.Rules[i] + type bucket struct { + firstFailedJob *batchv1.Job + firstFailureTime *metav1.Time + } + + // We make a bucket for each rule and then an extra bucket to represent the default rule + numBuckets := numRules + 1 + defaultRuleIndex := numRules + var buckets = make([]bucket, numRules+1) + + for _, failedJob := range failedOwnedJobs { + jobFailureCondition := findJobFailureCondition(failedJob) + // This means that the Job has not failed. + if jobFailureCondition == nil { + continue + } + + jobFailureTime, jobFailureReason := ptr.To(jobFailureCondition.LastTransitionTime), jobFailureCondition.Reason + for i := 0; i < numBuckets; i++ { + // We use nil to represent the default rule that + // applies to all job failure reasons. + var rule *jobset.FailurePolicyRule + if i < numRules { + rule = ptr.To(rules[i]) + } + if ruleIsApplicable(rule, failedJob, jobFailureReason) && (buckets[i].firstFailureTime == nil || jobFailureTime.Before(buckets[i].firstFailureTime)) { + buckets[i].firstFailedJob = failedJob + buckets[i].firstFailureTime = jobFailureTime + } + } + } + + // Checking if any failure policy rules were matched + // and returning the rule along with the first + // failed job to match it. + for i := 0; i < numRules; i++ { + if buckets[i].firstFailedJob != nil { + return &rules[i], buckets[i].firstFailedJob + } + } + + // We get here when no rule matched any of the failure policy rules + log.V(2).Info("never found a matching rule and returning nil to represent the default rule.") + return nil, buckets[defaultRuleIndex].firstFailedJob +} + +// failurePolicyRecreateAll triggers a JobSet restart for the next reconcillation loop. +func failurePolicyRecreateAll(ctx context.Context, js *jobset.JobSet, shouldCountTowardsMax bool, updateStatusOpts *statusUpdateOpts, event *eventParams) { + log := ctrl.LoggerFrom(ctx) + + if updateStatusOpts == nil { + updateStatusOpts = &statusUpdateOpts{} + } + + // Increment JobSet restarts. This will trigger reconciliation and result in deletions + // of old jobs not part of the current jobSet run. + js.Status.Restarts += 1 + + if shouldCountTowardsMax { + js.Status.RestartsCountTowardsMax += 1 + } + + updateStatusOpts.shouldUpdate = true + + // Emit event for each JobSet restarts for observability and debugability. + enqueueEvent(updateStatusOpts, event) + log.V(2).Info("attempting restart", "restart attempt", js.Status.Restarts) +} + +// parentReplicatedJobName returns the name of the parent +// ReplicatedJob and true if it is able to retrieve the parent. +// The empty string and false are returned otherwise. +func parentReplicatedJobName(job *batchv1.Job) (string, bool) { + if job == nil { + return "", false + } + + replicatedJobName, ok := job.Labels[jobset.ReplicatedJobNameKey] + replicatedJobNameIsUnset := !ok || replicatedJobName == "" + return replicatedJobName, !replicatedJobNameIsUnset +} + +// The type failurePolicyActionApplier applies a FailurePolicyAction and returns nil if the FailurePolicyAction was successfully applied. +// The function returns an error otherwise. +type failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error + +// failJobSetActionApplier applies the FailJobSet FailurePolicyAction +var failJobSetActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { + failureMessage := messageWithFirstFailedJob(constants.ReachedMaxRestartsMessage, matchingFailedJob.Name) + setJobSetFailedCondition(ctx, js, constants.ReachedMaxRestartsReason, failureMessage, updateStatusOpts) + return nil +} + +// restartJobSetActionApplier applies the RestartJobSet FailurePolicyAction +var restartJobSetActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { + if js.Status.RestartsCountTowardsMax >= js.Spec.FailurePolicy.MaxRestarts { + failureMessage := messageWithFirstFailedJob(constants.ReachedMaxRestartsMessage, matchingFailedJob.Name) + setJobSetFailedCondition(ctx, js, constants.ReachedMaxRestartsReason, failureMessage, updateStatusOpts) + return nil + } + + shouldCountTowardsMax := true + event := &eventParams{ + object: js, + eventType: corev1.EventTypeWarning, + eventReason: fmt.Sprintf("restarting jobset, attempt %d", js.Status.Restarts), + } + failurePolicyRecreateAll(ctx, js, shouldCountTowardsMax, updateStatusOpts, event) + return nil +} + +// restartJobSetAndIgnoreMaxRestartsActionApplier applies the RestartJobSetAndIgnoreMaxRestarts FailurePolicyAction +var restartJobSetAndIgnoreMaxRestartsActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { + shouldCountTowardsMax := false + event := &eventParams{ + object: js, + eventType: corev1.EventTypeWarning, + eventReason: fmt.Sprintf("restarting jobset without counting towards maximum restarts, attempt %d", js.Status.Restarts), + } + failurePolicyRecreateAll(ctx, js, shouldCountTowardsMax, updateStatusOpts, event) + return nil +} + +// applyFailurePolicyRuleAction applies the supplied FailurePolicyRuleAction. +func applyFailurePolicyRuleAction(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOps *statusUpdateOpts, failurePolicyRuleAction jobset.FailurePolicyAction) error { + log := ctrl.LoggerFrom(ctx) + + applier, ok := actionFunctionMap[failurePolicyRuleAction] + if !ok { + err := fmt.Errorf("failed to find a corresponding action for the FailurePolicyRuleAction %v", failurePolicyRuleAction) + log.Error(err, "retrieving information for FailurePolicyRuleAction") + return err + } + + err := applier(ctx, js, matchingFailedJob, updateStatusOps) + if err != nil { + log.Error(err, "error applying the FailurePolicyRuleAction: %v", failurePolicyRuleAction) + return err + } + + return nil +} diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go new file mode 100644 index 00000000..d6bb0699 --- /dev/null +++ b/pkg/controllers/failure_policy_test.go @@ -0,0 +1,279 @@ +/* +Copyright 2023 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + batchv1 "k8s.io/api/batch/v1" + "k8s.io/utils/ptr" + + jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" + testutils "sigs.k8s.io/jobset/pkg/util/testing" +) + +func TestFailurePolicyRuleIsApplicable(t *testing.T) { + var ( + replicatedJobName1 = "test-replicated-job-1" + replicatedJobName2 = "test-replicated-job-2" + jobName = "test-job" + ns = "default" + ) + + tests := []struct { + name string + rule *jobset.FailurePolicyRule + failedJob *batchv1.Job + jobFailureReason string + expected bool + }{ + { + name: "failure policy rule is nil", + expected: true, + }, + { + name: "failure policy rule matches on job failure reason", + rule: &jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + TargetReplicatedJobs: []string{replicatedJobName1}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, + expected: true, + }, + { + name: "failure policy rule matches all on job failure reason", + rule: &jobset.FailurePolicyRule{ + TargetReplicatedJobs: []string{replicatedJobName1}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: batchv1.JobReasonMaxFailedIndexesExceeded, + expected: true, + }, + { + name: "failure policy rule does not match on job failure reason", + rule: &jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + TargetReplicatedJobs: []string{replicatedJobName1}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: batchv1.JobReasonDeadlineExceeded, + expected: false, + }, + { + name: "failure policy rule is not applicable to parent replicatedJob of failed job", + rule: &jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + TargetReplicatedJobs: []string{replicatedJobName1}, + }, + jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName2}, + ).Obj(), + expected: false, + }, + { + name: "failure policy rule is applicable to all replicatedjobs when targetedReplicatedJobs is omitted", + rule: &jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, + expected: true, + }, + { + name: "failure policy rule is applicable to parent replicatedJob when targetedReplicatedJobs is specified", + rule: &jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + TargetReplicatedJobs: []string{replicatedJobName1}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + actual := ruleIsApplicable(tc.rule, tc.failedJob, tc.jobFailureReason) + if diff := cmp.Diff(tc.expected, actual); diff != "" { + t.Errorf("unexpected finished value (+got/-want): %s", diff) + } + }) + } +} + +func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { + var ( + replicatedJobName = "test-replicatedJob" + jobSetName = "test-jobset" + ns = "default" + + failedJobNoReason1 = jobWithFailedCondition("job1", time.Now().Add(-6*time.Hour)) + failedJobNoReason2 = jobWithFailedCondition("job2", time.Now().Add(-3*time.Hour)) + failedJobNoReason3 = jobWithFailedCondition("job2", time.Now().Add(-1*time.Hour)) + + failedJob1 = jobWithFailedConditionAndOpts("job1", time.Now().Add(-6*time.Hour), + &failJobOptions{ + reason: ptr.To(batchv1.JobReasonBackoffLimitExceeded), + parentReplicatedJobName: ptr.To(replicatedJobName), + }, + ) + failedJob2 = jobWithFailedConditionAndOpts("job2", time.Now().Add(-3*time.Hour), + &failJobOptions{ + reason: ptr.To(batchv1.JobReasonDeadlineExceeded), + parentReplicatedJobName: ptr.To(replicatedJobName), + }, + ) + failedJob3 = jobWithFailedConditionAndOpts("job3", time.Now().Add(-1*time.Hour), + &failJobOptions{ + reason: ptr.To(batchv1.JobReasonFailedIndexes), + parentReplicatedJobName: ptr.To(replicatedJobName), + }, + ) + + // ruleN matches failedJobN + rule1 = jobset.FailurePolicyRule{ + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + } + rule2 = jobset.FailurePolicyRule{ + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonDeadlineExceeded}, + } + + unmatchedRule = jobset.FailurePolicyRule{ + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded, batchv1.JobReasonPodFailurePolicy}, + } + + extraFailedJob = jobWithFailedConditionAndOpts("extra-job1", time.Now().Add(3*time.Hour), + &failJobOptions{ + reason: ptr.To(batchv1.JobReasonDeadlineExceeded), + parentReplicatedJobName: ptr.To(replicatedJobName), + }, + ) + ) + tests := []struct { + name string + js *jobset.JobSet + failedOwnedJobs []*batchv1.Job + + expectedFailurePolicyRule *jobset.FailurePolicyRule + expectedJob *batchv1.Job + }{ + { + name: "failure policy rules are empty with no failed jobs", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), + failedOwnedJobs: []*batchv1.Job{}, + + expectedFailurePolicyRule: nil, + expectedJob: nil, + }, + { + name: "failure policy rules are empty with one failed job", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), + failedOwnedJobs: []*batchv1.Job{ + failedJobNoReason1, + }, + + expectedFailurePolicyRule: nil, + expectedJob: failedJobNoReason1, + }, + { + name: "failure policy rules are empty with multiple failed jobs", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), + failedOwnedJobs: []*batchv1.Job{failedJobNoReason3, failedJobNoReason1, failedJobNoReason2}, + + expectedFailurePolicyRule: nil, + expectedJob: failedJobNoReason1, + }, + { + name: "failure policy rule does not match on job failure reasons", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{unmatchedRule}, + }).Obj(), + failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + + expectedFailurePolicyRule: nil, + expectedJob: failedJob1, + }, + { + name: "failure policy rule matches first job to fail out of all jobs", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{rule1}, + }).Obj(), + failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + + expectedFailurePolicyRule: &rule1, + expectedJob: failedJob1, + }, + { + name: "failure policy rule matches second job to fail out of all jobs", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{rule2}, + }).Obj(), + failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + + expectedFailurePolicyRule: &rule2, + expectedJob: failedJob2, + }, + { + name: "failure policy rule matches multiple jobs and first failed job is the last one", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{rule2}, + }).Obj(), + failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, + + expectedFailurePolicyRule: &rule2, + expectedJob: failedJob2, + }, + { + name: "first failed job that matches a failure policy rule is different from the first job to fail that matches the first matched failure policy rule", + js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{rule2, rule1}, + }).Obj(), + // failedJob1 is the first failedJob1 but does not match rule2 which is the first failure policy rule to be matched + failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, + + expectedFailurePolicyRule: &rule2, + expectedJob: failedJob2, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + actualRule, actualJob := findFirstFailedPolicyRuleAndJob(context.TODO(), tc.js, tc.failedOwnedJobs) + if diff := cmp.Diff(tc.expectedJob, actualJob); diff != "" { + t.Errorf("unexpected finished value (+got/-want): %s", diff) + } + if diff := cmp.Diff(tc.expectedFailurePolicyRule, actualRule); diff != "" { + t.Errorf("unexpected finished value (+got/-want): %s", diff) + } + }) + } +} diff --git a/pkg/controllers/jobset_controller.go b/pkg/controllers/jobset_controller.go index 9b713dc9..3ff68433 100644 --- a/pkg/controllers/jobset_controller.go +++ b/pkg/controllers/jobset_controller.go @@ -172,7 +172,10 @@ func (r *JobSetReconciler) reconcile(ctx context.Context, js *jobset.JobSet, upd // If any jobs have failed, execute the JobSet failure policy (if any). if len(ownedJobs.failed) > 0 { - executeFailurePolicy(ctx, js, ownedJobs, updateStatusOpts) + if err := executeFailurePolicy(ctx, js, ownedJobs, updateStatusOpts); err != nil { + log.Error(err, "executing failure policy") + return ctrl.Result{}, err + } return ctrl.Result{}, nil } @@ -608,46 +611,6 @@ func executeSuccessPolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi return false } -func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs, updateStatusOpts *statusUpdateOpts) { - // If no failure policy is defined, mark the JobSet as failed. - if js.Spec.FailurePolicy == nil { - // firstFailedJob is only computed if necessary since it is expensive to compute - // for JobSets with many child jobs. This is why we don't unconditionally compute - // it once at the beginning of the function and share the results between the different - // possible code paths here. - firstFailedJob := findFirstFailedJob(ownedJobs.failed) - setJobSetFailedCondition(ctx, js, constants.FailedJobsReason, messageWithFirstFailedJob(constants.FailedJobsMessage, firstFailedJob.Name), updateStatusOpts) - return - } - - // If JobSet has reached max restarts, fail the JobSet. - if js.Status.Restarts >= js.Spec.FailurePolicy.MaxRestarts { - firstFailedJob := findFirstFailedJob(ownedJobs.failed) - setJobSetFailedCondition(ctx, js, constants.ReachedMaxRestartsReason, messageWithFirstFailedJob(constants.ReachedMaxRestartsMessage, firstFailedJob.Name), updateStatusOpts) - return - } - - // To reach this point a job must have failed. - failurePolicyRecreateAll(ctx, js, updateStatusOpts) -} - -func failurePolicyRecreateAll(ctx context.Context, js *jobset.JobSet, updateStatusOpts *statusUpdateOpts) { - log := ctrl.LoggerFrom(ctx) - - // Increment JobSet restarts. This will trigger reconciliation and result in deletions - // of old jobs not part of the current jobSet run. - js.Status.Restarts += 1 - updateStatusOpts.shouldUpdate = true - - // Emit event for each JobSet restarts for observability and debugability. - enqueueEvent(updateStatusOpts, &eventParams{ - object: js, - eventType: corev1.EventTypeWarning, - eventReason: fmt.Sprintf("restarting jobset, attempt %d", js.Status.Restarts), - }) - log.V(2).Info("attempting restart", "restart attempt", js.Status.Restarts) -} - func constructJobsFromTemplate(js *jobset.JobSet, rjob *jobset.ReplicatedJob, ownedJobs *childJobs) ([]*batchv1.Job, error) { var jobs []*batchv1.Job for jobIdx := 0; jobIdx < int(rjob.Replicas); jobIdx++ { @@ -883,21 +846,31 @@ func findFirstFailedJob(failedJobs []*batchv1.Job) *batchv1.Job { return firstFailedJob } -// findJobFailureTime is a helper function which extracts the Job failure time from a Job, +// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job, // if the JobFailed condition exists and is true. -func findJobFailureTime(job *batchv1.Job) *metav1.Time { +func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { if job == nil { return nil } for _, c := range job.Status.Conditions { // If this Job failed before the oldest known Job failiure, update the first failed job. if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { - return &c.LastTransitionTime + return &c } } return nil } +// findJobFailureTime is a helper function which extracts the Job failure time from a Job, +// if the JobFailed condition exists and is true. +func findJobFailureTime(job *batchv1.Job) *metav1.Time { + failureCondition := findJobFailureCondition(job) + if failureCondition == nil { + return nil + } + return &failureCondition.LastTransitionTime +} + // managedByExternalController returns a pointer to the name of the external controller managing // the JobSet, if one exists. Otherwise, it returns nil. func managedByExternalController(js *jobset.JobSet) *string { @@ -927,18 +900,23 @@ func setCondition(js *jobset.JobSet, condOpts *conditionOpts, updateStatusOpts * return } + if updateStatusOpts == nil { + updateStatusOpts = &statusUpdateOpts{} + } + // If we made some changes to the status conditions, configure updateStatusOpts // to persist the status update at the end of the reconciliation attempt. updateStatusOpts.shouldUpdate = true // Conditionally emit an event for each JobSet condition update if and only if // the status update call is successful. - enqueueEvent(updateStatusOpts, &eventParams{ + event := &eventParams{ object: js, eventType: condOpts.eventType, eventReason: condOpts.condition.Reason, eventMessage: condOpts.condition.Message, - }) + } + enqueueEvent(updateStatusOpts, event) } // updateCondition accepts a given condition and does one of the following: diff --git a/pkg/controllers/jobset_controller_test.go b/pkg/controllers/jobset_controller_test.go index 7d00e35a..c4bc3f5c 100644 --- a/pkg/controllers/jobset_controller_test.go +++ b/pkg/controllers/jobset_controller_test.go @@ -1158,18 +1158,48 @@ func TestFindFirstFailedJob(t *testing.T) { // Helper function to create a job object with a failed condition func jobWithFailedCondition(name string, failureTime time.Time) *batchv1.Job { - return &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: name}, + return jobWithFailedConditionAndOpts(name, failureTime, nil) +} + +type failJobOptions struct { + reason *string + parentReplicatedJobName *string +} + +// Helper function to create a job object with a failed condition +func jobWithFailedConditionAndOpts(name string, failureTime time.Time, opts *failJobOptions) *batchv1.Job { + var reason string + labels := make(map[string]string) + applyOpts := func() { + if opts == nil { + return + } + + if opts.reason != nil { + reason = *opts.reason + } + + if opts.parentReplicatedJobName != nil { + labels[jobset.ReplicatedJobNameKey] = *opts.parentReplicatedJobName + } + } + applyOpts() + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{Name: name, Labels: labels}, Status: batchv1.JobStatus{ Conditions: []batchv1.JobCondition{ { Type: batchv1.JobFailed, Status: corev1.ConditionTrue, LastTransitionTime: metav1.NewTime(failureTime), + Reason: reason, }, }, }, } + + return job } type makeJobArgs struct { diff --git a/pkg/webhooks/jobset_webhook.go b/pkg/webhooks/jobset_webhook.go index e26cd114..edef50df 100644 --- a/pkg/webhooks/jobset_webhook.go +++ b/pkg/webhooks/jobset_webhook.go @@ -61,6 +61,17 @@ const ( subdomainTooLongErrMsg = ".spec.network.subdomain is too long, must be less than 63 characters" ) +// validOnJobFailureReasons stores supported values of the reason field of the condition of +// a failed job. See https://github.com/kubernetes/api/blob/2676848ed8201866119a94759a2d525ffc7396c0/batch/v1/types.go#L632 +// for more details. +var validOnJobFailureReasons = []string{ + batchv1.JobReasonBackoffLimitExceeded, + batchv1.JobReasonDeadlineExceeded, + batchv1.JobReasonFailedIndexes, + batchv1.JobReasonMaxFailedIndexesExceeded, + batchv1.JobReasonPodFailurePolicy, +} + //+kubebuilder:webhook:path=/mutate-jobset-x-k8s-io-v1alpha2-jobset,mutating=true,failurePolicy=fail,sideEffects=None,groups=jobset.x-k8s.io,resources=jobsets,verbs=create;update,versions=v1alpha2,name=mjobset.kb.io,admissionReviewVersions=v1 // jobSetWebhook for defaulting and admission. @@ -207,6 +218,25 @@ func (j *jobSetWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) allErrs = append(allErrs, fmt.Errorf("invalid replicatedJob name '%s' does not appear in .spec.ReplicatedJobs", rjobName)) } } + + // Validate failure policy + if js.Spec.FailurePolicy != nil { + for _, rule := range js.Spec.FailurePolicy.Rules { + // Validate the rules target replicated jobs are valid + for _, rjobName := range rule.TargetReplicatedJobs { + if !collections.Contains(validReplicatedJobs, rjobName) { + allErrs = append(allErrs, fmt.Errorf("invalid replicatedJob name '%s' in failure policy does not appear in .spec.ReplicatedJobs", rjobName)) + } + } + + // Validate the rules on job failure reasons are valid + for _, failureReason := range rule.OnJobFailureReasons { + if !collections.Contains(validOnJobFailureReasons, failureReason) { + allErrs = append(allErrs, fmt.Errorf("invalid job failure reason '%s' in failure policy is not a recognized job failure reason", failureReason)) + } + } + } + } return nil, errors.Join(allErrs...) } diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index 52cbcd80..9068bf90 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -969,6 +969,74 @@ func TestValidateCreate(t *testing.T) { }, want: errors.Join(), }, + { + name: "jobset failure policy has an invalid on job failure reason", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + FailurePolicy: &jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{"fakeReason"}, + }, + }, + }, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid job failure reason '%s' in failure policy is not a recognized job failure reason", "fakeReason"), + ), + }, + { + name: "jobset failure policy has an invalid replicated job", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + FailurePolicy: &jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + TargetReplicatedJobs: []string{"fakeReplicatedJob"}, + }, + }, + }, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid replicatedJob name '%s' in failure policy does not appear in .spec.ReplicatedJobs", "fakeReplicatedJob"), + ), + }, } fakeClient := fake.NewFakeClient() webhook, err := NewJobSetWebhook(fakeClient) diff --git a/sdk/python/README.md b/sdk/python/README.md index affd3472..9d6faf71 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -65,6 +65,7 @@ Class | Method | HTTP request | Description ## Documentation For Models - [JobsetV1alpha2FailurePolicy](docs/JobsetV1alpha2FailurePolicy.md) + - [JobsetV1alpha2FailurePolicyRule](docs/JobsetV1alpha2FailurePolicyRule.md) - [JobsetV1alpha2JobSet](docs/JobsetV1alpha2JobSet.md) - [JobsetV1alpha2JobSetList](docs/JobsetV1alpha2JobSetList.md) - [JobsetV1alpha2JobSetSpec](docs/JobsetV1alpha2JobSetSpec.md) diff --git a/sdk/python/docs/JobsetV1alpha2FailurePolicy.md b/sdk/python/docs/JobsetV1alpha2FailurePolicy.md index 4df59ee1..e7b5a2d6 100644 --- a/sdk/python/docs/JobsetV1alpha2FailurePolicy.md +++ b/sdk/python/docs/JobsetV1alpha2FailurePolicy.md @@ -4,6 +4,7 @@ Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **max_restarts** | **int** | MaxRestarts defines the limit on the number of JobSet restarts. A restart is achieved by recreating all active child jobs. | [optional] +**rules** | [**list[JobsetV1alpha2FailurePolicyRule]**](JobsetV1alpha2FailurePolicyRule.md) | List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied. | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md b/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md new file mode 100644 index 00000000..1b98164a --- /dev/null +++ b/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md @@ -0,0 +1,13 @@ +# JobsetV1alpha2FailurePolicyRule + +FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fails due to a reason listed in OnJobFailureReasons. +## Properties +Name | Type | Description | Notes +------------ | ------------- | ------------- | ------------- +**action** | **str** | The action to take if the rule is matched. | [default to ''] +**on_job_failure_reasons** | **list[str]** | The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason. | +**target_replicated_jobs** | **list[str]** | TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs. | [optional] + +[[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) + + diff --git a/sdk/python/docs/JobsetV1alpha2JobSetStatus.md b/sdk/python/docs/JobsetV1alpha2JobSetStatus.md index b5262dab..3a223368 100644 --- a/sdk/python/docs/JobsetV1alpha2JobSetStatus.md +++ b/sdk/python/docs/JobsetV1alpha2JobSetStatus.md @@ -7,6 +7,7 @@ Name | Type | Description | Notes **conditions** | [**list[V1Condition]**](V1Condition.md) | | [optional] **replicated_jobs_status** | [**list[JobsetV1alpha2ReplicatedJobStatus]**](JobsetV1alpha2ReplicatedJobStatus.md) | ReplicatedJobsStatus track the number of JobsReady for each replicatedJob. | [optional] **restarts** | **int** | Restarts tracks the number of times the JobSet has restarted (i.e. recreated in case of RecreateAll policy). | [optional] +**restarts_count_towards_max** | **int** | RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts. | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk/python/jobset/__init__.py b/sdk/python/jobset/__init__.py index 591721ac..4f0e78d6 100644 --- a/sdk/python/jobset/__init__.py +++ b/sdk/python/jobset/__init__.py @@ -28,6 +28,7 @@ from jobset.exceptions import ApiException # import models into sdk package from jobset.models.jobset_v1alpha2_failure_policy import JobsetV1alpha2FailurePolicy +from jobset.models.jobset_v1alpha2_failure_policy_rule import JobsetV1alpha2FailurePolicyRule from jobset.models.jobset_v1alpha2_job_set import JobsetV1alpha2JobSet from jobset.models.jobset_v1alpha2_job_set_list import JobsetV1alpha2JobSetList from jobset.models.jobset_v1alpha2_job_set_spec import JobsetV1alpha2JobSetSpec diff --git a/sdk/python/jobset/models/__init__.py b/sdk/python/jobset/models/__init__.py index c3632357..26cb70c1 100644 --- a/sdk/python/jobset/models/__init__.py +++ b/sdk/python/jobset/models/__init__.py @@ -18,6 +18,7 @@ # import models into model package from jobset.models.jobset_v1alpha2_failure_policy import JobsetV1alpha2FailurePolicy +from jobset.models.jobset_v1alpha2_failure_policy_rule import JobsetV1alpha2FailurePolicyRule from jobset.models.jobset_v1alpha2_job_set import JobsetV1alpha2JobSet from jobset.models.jobset_v1alpha2_job_set_list import JobsetV1alpha2JobSetList from jobset.models.jobset_v1alpha2_job_set_spec import JobsetV1alpha2JobSetSpec diff --git a/sdk/python/jobset/models/jobset_v1alpha2_failure_policy.py b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy.py index 5b57e6ca..0e768064 100644 --- a/sdk/python/jobset/models/jobset_v1alpha2_failure_policy.py +++ b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy.py @@ -33,24 +33,29 @@ class JobsetV1alpha2FailurePolicy(object): and the value is json key in definition. """ openapi_types = { - 'max_restarts': 'int' + 'max_restarts': 'int', + 'rules': 'list[JobsetV1alpha2FailurePolicyRule]' } attribute_map = { - 'max_restarts': 'maxRestarts' + 'max_restarts': 'maxRestarts', + 'rules': 'rules' } - def __init__(self, max_restarts=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, max_restarts=None, rules=None, local_vars_configuration=None): # noqa: E501 """JobsetV1alpha2FailurePolicy - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() self.local_vars_configuration = local_vars_configuration self._max_restarts = None + self._rules = None self.discriminator = None if max_restarts is not None: self.max_restarts = max_restarts + if rules is not None: + self.rules = rules @property def max_restarts(self): @@ -75,6 +80,29 @@ def max_restarts(self, max_restarts): self._max_restarts = max_restarts + @property + def rules(self): + """Gets the rules of this JobsetV1alpha2FailurePolicy. # noqa: E501 + + List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied. # noqa: E501 + + :return: The rules of this JobsetV1alpha2FailurePolicy. # noqa: E501 + :rtype: list[JobsetV1alpha2FailurePolicyRule] + """ + return self._rules + + @rules.setter + def rules(self, rules): + """Sets the rules of this JobsetV1alpha2FailurePolicy. + + List of failure policy rules for this JobSet. For a given Job failure, the rules will be evaluated in order, and only the first matching rule will be executed. If no matching rule is found, the RestartJobSet action is applied. # noqa: E501 + + :param rules: The rules of this JobsetV1alpha2FailurePolicy. # noqa: E501 + :type: list[JobsetV1alpha2FailurePolicyRule] + """ + + self._rules = rules + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py new file mode 100644 index 00000000..3d5871d4 --- /dev/null +++ b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py @@ -0,0 +1,180 @@ +# coding: utf-8 + +""" + JobSet SDK + + Python SDK for the JobSet API # noqa: E501 + + The version of the OpenAPI document: v0.1.4 + Generated by: https://openapi-generator.tech +""" + + +import pprint +import re # noqa: F401 + +import six + +from jobset.configuration import Configuration + + +class JobsetV1alpha2FailurePolicyRule(object): + """NOTE: This class is auto generated by OpenAPI Generator. + Ref: https://openapi-generator.tech + + Do not edit the class manually. + """ + + """ + Attributes: + openapi_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. + """ + openapi_types = { + 'action': 'str', + 'on_job_failure_reasons': 'list[str]', + 'target_replicated_jobs': 'list[str]' + } + + attribute_map = { + 'action': 'action', + 'on_job_failure_reasons': 'onJobFailureReasons', + 'target_replicated_jobs': 'targetReplicatedJobs' + } + + def __init__(self, action='', on_job_failure_reasons=None, target_replicated_jobs=None, local_vars_configuration=None): # noqa: E501 + """JobsetV1alpha2FailurePolicyRule - a model defined in OpenAPI""" # noqa: E501 + if local_vars_configuration is None: + local_vars_configuration = Configuration() + self.local_vars_configuration = local_vars_configuration + + self._action = None + self._on_job_failure_reasons = None + self._target_replicated_jobs = None + self.discriminator = None + + self.action = action + self.on_job_failure_reasons = on_job_failure_reasons + if target_replicated_jobs is not None: + self.target_replicated_jobs = target_replicated_jobs + + @property + def action(self): + """Gets the action of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + + The action to take if the rule is matched. # noqa: E501 + + :return: The action of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :rtype: str + """ + return self._action + + @action.setter + def action(self, action): + """Sets the action of this JobsetV1alpha2FailurePolicyRule. + + The action to take if the rule is matched. # noqa: E501 + + :param action: The action of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :type: str + """ + if self.local_vars_configuration.client_side_validation and action is None: # noqa: E501 + raise ValueError("Invalid value for `action`, must not be `None`") # noqa: E501 + + self._action = action + + @property + def on_job_failure_reasons(self): + """Gets the on_job_failure_reasons of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + + The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason. # noqa: E501 + + :return: The on_job_failure_reasons of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :rtype: list[str] + """ + return self._on_job_failure_reasons + + @on_job_failure_reasons.setter + def on_job_failure_reasons(self, on_job_failure_reasons): + """Sets the on_job_failure_reasons of this JobsetV1alpha2FailurePolicyRule. + + The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason. # noqa: E501 + + :param on_job_failure_reasons: The on_job_failure_reasons of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :type: list[str] + """ + if self.local_vars_configuration.client_side_validation and on_job_failure_reasons is None: # noqa: E501 + raise ValueError("Invalid value for `on_job_failure_reasons`, must not be `None`") # noqa: E501 + + self._on_job_failure_reasons = on_job_failure_reasons + + @property + def target_replicated_jobs(self): + """Gets the target_replicated_jobs of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + + TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs. # noqa: E501 + + :return: The target_replicated_jobs of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :rtype: list[str] + """ + return self._target_replicated_jobs + + @target_replicated_jobs.setter + def target_replicated_jobs(self, target_replicated_jobs): + """Sets the target_replicated_jobs of this JobsetV1alpha2FailurePolicyRule. + + TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs. # noqa: E501 + + :param target_replicated_jobs: The target_replicated_jobs of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :type: list[str] + """ + + self._target_replicated_jobs = target_replicated_jobs + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.openapi_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, JobsetV1alpha2FailurePolicyRule): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, JobsetV1alpha2FailurePolicyRule): + return True + + return self.to_dict() != other.to_dict() diff --git a/sdk/python/jobset/models/jobset_v1alpha2_job_set_status.py b/sdk/python/jobset/models/jobset_v1alpha2_job_set_status.py index 1b50496e..e6486617 100644 --- a/sdk/python/jobset/models/jobset_v1alpha2_job_set_status.py +++ b/sdk/python/jobset/models/jobset_v1alpha2_job_set_status.py @@ -35,16 +35,18 @@ class JobsetV1alpha2JobSetStatus(object): openapi_types = { 'conditions': 'list[V1Condition]', 'replicated_jobs_status': 'list[JobsetV1alpha2ReplicatedJobStatus]', - 'restarts': 'int' + 'restarts': 'int', + 'restarts_count_towards_max': 'int' } attribute_map = { 'conditions': 'conditions', 'replicated_jobs_status': 'replicatedJobsStatus', - 'restarts': 'restarts' + 'restarts': 'restarts', + 'restarts_count_towards_max': 'restartsCountTowardsMax' } - def __init__(self, conditions=None, replicated_jobs_status=None, restarts=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, conditions=None, replicated_jobs_status=None, restarts=None, restarts_count_towards_max=None, local_vars_configuration=None): # noqa: E501 """JobsetV1alpha2JobSetStatus - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -53,6 +55,7 @@ def __init__(self, conditions=None, replicated_jobs_status=None, restarts=None, self._conditions = None self._replicated_jobs_status = None self._restarts = None + self._restarts_count_towards_max = None self.discriminator = None if conditions is not None: @@ -61,6 +64,8 @@ def __init__(self, conditions=None, replicated_jobs_status=None, restarts=None, self.replicated_jobs_status = replicated_jobs_status if restarts is not None: self.restarts = restarts + if restarts_count_towards_max is not None: + self.restarts_count_towards_max = restarts_count_towards_max @property def conditions(self): @@ -129,6 +134,29 @@ def restarts(self, restarts): self._restarts = restarts + @property + def restarts_count_towards_max(self): + """Gets the restarts_count_towards_max of this JobsetV1alpha2JobSetStatus. # noqa: E501 + + RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts. # noqa: E501 + + :return: The restarts_count_towards_max of this JobsetV1alpha2JobSetStatus. # noqa: E501 + :rtype: int + """ + return self._restarts_count_towards_max + + @restarts_count_towards_max.setter + def restarts_count_towards_max(self, restarts_count_towards_max): + """Sets the restarts_count_towards_max of this JobsetV1alpha2JobSetStatus. + + RestartsCountTowardsMax tracks the number of times the JobSet has restarted that counts towards the maximum allowed number of restarts. # noqa: E501 + + :param restarts_count_towards_max: The restarts_count_towards_max of this JobsetV1alpha2JobSetStatus. # noqa: E501 + :type: int + """ + + self._restarts_count_towards_max = restarts_count_towards_max + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/sdk/python/test/test_jobset_v1alpha2_failure_policy.py b/sdk/python/test/test_jobset_v1alpha2_failure_policy.py index f0b6d68c..b0498b70 100644 --- a/sdk/python/test/test_jobset_v1alpha2_failure_policy.py +++ b/sdk/python/test/test_jobset_v1alpha2_failure_policy.py @@ -38,7 +38,17 @@ def make_instance(self, include_optional): # model = jobset.models.jobset_v1alpha2_failure_policy.JobsetV1alpha2FailurePolicy() # noqa: E501 if include_optional : return JobsetV1alpha2FailurePolicy( - max_restarts = 56 + max_restarts = 56, + rules = [ + jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ], ) + ] ) else : return JobsetV1alpha2FailurePolicy( diff --git a/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py b/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py new file mode 100644 index 00000000..f5740fc7 --- /dev/null +++ b/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py @@ -0,0 +1,64 @@ +# coding: utf-8 + +""" + JobSet SDK + + Python SDK for the JobSet API # noqa: E501 + + The version of the OpenAPI document: v0.1.4 + Generated by: https://openapi-generator.tech +""" + + +from __future__ import absolute_import + +# Kubernetes imports +from kubernetes.client.models.v1_job_template_spec import V1JobTemplateSpec +import unittest +import datetime + +import jobset +from jobset.models.jobset_v1alpha2_failure_policy_rule import JobsetV1alpha2FailurePolicyRule # noqa: E501 +from jobset.rest import ApiException + +class TestJobsetV1alpha2FailurePolicyRule(unittest.TestCase): + """JobsetV1alpha2FailurePolicyRule unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional): + """Test JobsetV1alpha2FailurePolicyRule + include_option is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # model = jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule() # noqa: E501 + if include_optional : + return JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ] + ) + else : + return JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + ) + + def testJobsetV1alpha2FailurePolicyRule(self): + """Test JobsetV1alpha2FailurePolicyRule""" + inst_req_only = self.make_instance(include_optional=False) + inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set.py b/sdk/python/test/test_jobset_v1alpha2_job_set.py index c16e2b37..80bb1478 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set.py @@ -43,7 +43,17 @@ def make_instance(self, include_optional): metadata = None, spec = jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec( failure_policy = jobset.models.jobset_v1alpha2_failure_policy.JobsetV1alpha2FailurePolicy( - max_restarts = 56, ), + max_restarts = 56, + rules = [ + jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ], ) + ], ), managed_by = '0', network = jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network( enable_dns_hostnames = True, @@ -58,10 +68,7 @@ def make_instance(self, include_optional): startup_policy = jobset.models.jobset_v1alpha2_startup_policy.JobsetV1alpha2StartupPolicy( startup_policy_order = '0', ), success_policy = jobset.models.jobset_v1alpha2_success_policy.JobsetV1alpha2SuccessPolicy( - operator = '0', - target_replicated_jobs = [ - '0' - ], ), + operator = '0', ), suspend = True, ttl_seconds_after_finished = 56, ), status = jobset.models.jobset_v1alpha2_job_set_status.JobsetV1alpha2JobSetStatus( @@ -77,7 +84,8 @@ def make_instance(self, include_optional): succeeded = 56, suspended = 56, ) ], - restarts = 56, ) + restarts = 56, + restarts_count_towards_max = 56, ) ) else : return JobsetV1alpha2JobSet( diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set_list.py b/sdk/python/test/test_jobset_v1alpha2_job_set_list.py index 662f81d9..d1be43fe 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set_list.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set_list.py @@ -46,7 +46,17 @@ def make_instance(self, include_optional): metadata = None, spec = jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec( failure_policy = jobset.models.jobset_v1alpha2_failure_policy.JobsetV1alpha2FailurePolicy( - max_restarts = 56, ), + max_restarts = 56, + rules = [ + jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ], ) + ], ), managed_by = '0', network = jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network( enable_dns_hostnames = True, @@ -61,10 +71,7 @@ def make_instance(self, include_optional): startup_policy = jobset.models.jobset_v1alpha2_startup_policy.JobsetV1alpha2StartupPolicy( startup_policy_order = '0', ), success_policy = jobset.models.jobset_v1alpha2_success_policy.JobsetV1alpha2SuccessPolicy( - operator = '0', - target_replicated_jobs = [ - '0' - ], ), + operator = '0', ), suspend = True, ttl_seconds_after_finished = 56, ), status = jobset.models.jobset_v1alpha2_job_set_status.JobsetV1alpha2JobSetStatus( @@ -80,7 +87,8 @@ def make_instance(self, include_optional): succeeded = 56, suspended = 56, ) ], - restarts = 56, ), ) + restarts = 56, + restarts_count_towards_max = 56, ), ) ], kind = '0', metadata = None @@ -94,7 +102,17 @@ def make_instance(self, include_optional): metadata = None, spec = jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec( failure_policy = jobset.models.jobset_v1alpha2_failure_policy.JobsetV1alpha2FailurePolicy( - max_restarts = 56, ), + max_restarts = 56, + rules = [ + jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ], ) + ], ), managed_by = '0', network = jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network( enable_dns_hostnames = True, @@ -109,10 +127,7 @@ def make_instance(self, include_optional): startup_policy = jobset.models.jobset_v1alpha2_startup_policy.JobsetV1alpha2StartupPolicy( startup_policy_order = '0', ), success_policy = jobset.models.jobset_v1alpha2_success_policy.JobsetV1alpha2SuccessPolicy( - operator = '0', - target_replicated_jobs = [ - '0' - ], ), + operator = '0', ), suspend = True, ttl_seconds_after_finished = 56, ), status = jobset.models.jobset_v1alpha2_job_set_status.JobsetV1alpha2JobSetStatus( @@ -128,7 +143,8 @@ def make_instance(self, include_optional): succeeded = 56, suspended = 56, ) ], - restarts = 56, ), ) + restarts = 56, + restarts_count_towards_max = 56, ), ) ], ) diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py b/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py index 6671320c..4010e5fd 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py @@ -39,7 +39,17 @@ def make_instance(self, include_optional): if include_optional : return JobsetV1alpha2JobSetSpec( failure_policy = jobset.models.jobset_v1alpha2_failure_policy.JobsetV1alpha2FailurePolicy( - max_restarts = 56, ), + max_restarts = 56, + rules = [ + jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( + action = '0', + on_job_failure_reasons = [ + '0' + ], + target_replicated_jobs = [ + '0' + ], ) + ], ), managed_by = '0', network = jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network( enable_dns_hostnames = True, diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set_status.py b/sdk/python/test/test_jobset_v1alpha2_job_set_status.py index 293a2ee8..d7457001 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set_status.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set_status.py @@ -50,7 +50,8 @@ def make_instance(self, include_optional): succeeded = 56, suspended = 56, ) ], - restarts = 56 + restarts = 56, + restarts_count_towards_max = 56 ) else : return JobsetV1alpha2JobSetStatus( diff --git a/test/integration/controller/jobset_controller_test.go b/test/integration/controller/jobset_controller_test.go index 5afa01df..ddf73701 100644 --- a/test/integration/controller/jobset_controller_test.go +++ b/test/integration/controller/jobset_controller_test.go @@ -443,6 +443,328 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, }, }), + ginkgo.Entry("jobset fails immediately with FailJobSet failure policy action.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetFailed, + }, + }, + }), + ginkgo.Entry("jobset does not fail immediately with FailJobSet failure policy action.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + }, + }), + ginkgo.Entry("jobset restarts with RestartJobSet Failure Policy Action.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonPodFailurePolicy}, + }, + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + }, + }), + ginkgo.Entry("jobset restarts with RestartJobSetAndIgnoremaxRestarts Failure Policy Action.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.RestartJobSetAndIgnoreMaxRestarts, + OnJobFailureReasons: []string{batchv1.JobReasonPodFailurePolicy}, + }, + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failJobWithOptions(&jobList.Items[0], &failJobOptions{reason: ptr.To(batchv1.JobReasonPodFailurePolicy)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + }, + }), + ginkgo.Entry("job fails and the parent replicated job is contained in TargetReplicatedJobs.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonFailedIndexes}, + TargetReplicatedJobs: []string{"replicated-job-b"}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-b", &failJobOptions{reason: ptr.To(batchv1.JobReasonFailedIndexes)}) + }, + checkJobSetCondition: testutil.JobSetFailed, + }, + }, + }), + ginkgo.Entry("job fails and the parent replicated job is not contained in TargetReplicatedJobs.", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-b", &failJobOptions{reason: ptr.To(batchv1.JobReasonBackoffLimitExceeded)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + checkJobSetCondition: testutil.JobSetActive, + }, + }, + }), + ginkgo.Entry("failure policy rules order verification test 1", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + { + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-a", &failJobOptions{reason: ptr.To(batchv1.JobReasonMaxFailedIndexesExceeded)}) + }, + checkJobSetCondition: testutil.JobSetFailed, + }, + }, + }), + ginkgo.Entry("failure policy rules order verification test 2", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.RestartJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-a", &failJobOptions{reason: ptr.To(batchv1.JobReasonMaxFailedIndexesExceeded)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + }, + }), + ginkgo.Entry("failure policy rules order verification test 3", &testCase{ + makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { + return testJobSet(ns). + FailurePolicy(&jobset.FailurePolicy{ + MaxRestarts: 1, + Rules: []jobset.FailurePolicyRule{ + { + Action: jobset.RestartJobSetAndIgnoreMaxRestarts, + OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded}, + TargetReplicatedJobs: []string{"replicated-job-a"}, + }, + { + Action: jobset.FailJobSet, + OnJobFailureReasons: []string{}, + TargetReplicatedJobs: []string{}, + }, + }, + }) + }, + updates: []*update{ + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-a", &failJobOptions{reason: ptr.To(batchv1.JobReasonMaxFailedIndexesExceeded)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-a", &failJobOptions{reason: ptr.To(batchv1.JobReasonMaxFailedIndexesExceeded)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJobWithOptions(jobList, "replicated-job-a", &failJobOptions{reason: ptr.To(batchv1.JobReasonMaxFailedIndexesExceeded)}) + }, + checkJobSetCondition: testutil.JobSetActive, + }, + { + jobSetUpdateFn: func(js *jobset.JobSet) { + // For a restart, all jobs will be deleted and recreated, so we expect a + // foreground deletion finalizer for every job. + removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) + }, + }, + { + jobUpdateFn: func(jobList *batchv1.JobList) { + failFirstMatchingJob(jobList, "replicated-job-b") + }, + checkJobSetCondition: testutil.JobSetFailed, + }, + }, + }), ginkgo.Entry("job succeeds after one failure", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). @@ -1442,17 +1764,55 @@ func updateJobStatus(job *batchv1.Job, status batchv1.JobStatus) { jobGet.Status = status return k8sClient.Status().Update(ctx, &jobGet) }, timeout, interval).Should(gomega.Succeed()) + } -func failJob(job *batchv1.Job) { +type failJobOptions struct { + reason *string +} + +func failJobWithOptions(job *batchv1.Job, failJobOpts *failJobOptions) { + if failJobOpts == nil { + failJobOpts = &failJobOptions{} + } updateJobStatus(job, batchv1.JobStatus{ Conditions: append(job.Status.Conditions, batchv1.JobCondition{ Type: batchv1.JobFailed, Status: corev1.ConditionTrue, + Reason: ptr.Deref(failJobOpts.reason, ""), }), }) } +func failJob(job *batchv1.Job) { + failJobWithOptions(job, nil) +} + +// failFirstMatchingJobWithOptions fails the first matching job (in terms of index in jobList) that is a child of +// replicatedJobName with extra options. No job is failed if a matching job does not exist. +func failFirstMatchingJobWithOptions(jobList *batchv1.JobList, replicatedJobName string, failJobOpts *failJobOptions) { + if jobList == nil { + return + } + if failJobOpts == nil { + failJobOpts = &failJobOptions{} + } + + for _, job := range jobList.Items { + parentReplicatedJob := job.Labels[jobset.ReplicatedJobNameKey] + if parentReplicatedJob == replicatedJobName { + failJobWithOptions(&job, failJobOpts) + return + } + } +} + +// failFirstMatchingJob fails the first matching job (in terms of index in jobList) that is a child of +// replicatedJobName. No job is failed if a matching job does not exist. +func failFirstMatchingJob(jobList *batchv1.JobList, replicatedJobName string) { + failFirstMatchingJobWithOptions(jobList, replicatedJobName, nil) +} + func suspendJobSet(js *jobset.JobSet, suspend bool) { gomega.Eventually(func() error { var jsGet jobset.JobSet From a3cecb17588ef836984b5261b1ee5cdd427aa2bf Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Fri, 3 May 2024 21:46:55 +0000 Subject: [PATCH 02/25] Refactor findFirstFailedPolicyRuleAndJob, ruleIsApplicable, and TestFindFirstFailedPolicyRuleAndJob. --- pkg/controllers/failure_policy.go | 98 +++++++++----------------- pkg/controllers/failure_policy_test.go | 63 ++++++----------- 2 files changed, 55 insertions(+), 106 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index 4c2162d5..a231faf0 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -47,15 +47,20 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi // it once at the beginning of the function and share the results between the different // possible code paths here. firstFailedJob := findFirstFailedJob(ownedJobs.failed) - setJobSetFailedCondition(ctx, js, constants.FailedJobsReason, messageWithFirstFailedJob(constants.FailedJobsMessage, firstFailedJob.Name), updateStatusOpts) + msg := messageWithFirstFailedJob(constants.FailedJobsMessage, firstFailedJob.Name) + setJobSetFailedCondition(ctx, js, constants.FailedJobsReason, msg, updateStatusOpts) return nil } // Check for matching Failure Policy Rule - failurePolicyRule, matchingFailedJob := findFirstFailedPolicyRuleAndJob(ctx, js, ownedJobs.failed) + rules := js.Spec.FailurePolicy.Rules + failurePolicyRule, matchingFailedJob := findFirstFailedPolicyRuleAndJob(ctx, rules, ownedJobs.failed) - failurePolicyRuleAction := jobset.RestartJobSet - if failurePolicyRule != nil { + var failurePolicyRuleAction jobset.FailurePolicyAction + if failurePolicyRule == nil { + failurePolicyRuleAction = jobset.RestartJobSet + matchingFailedJob = findFirstFailedJob(ownedJobs.failed) + } else { failurePolicyRuleAction = failurePolicyRule.Action } @@ -67,15 +72,9 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi return nil } -// If the failure policy rule is not nil, then the functions returns: -// - true if the rule is applicable to the jobFailureReason; -// - false other. -// If the rule is nil, we return true. -func ruleIsApplicable(rule *jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { - if rule == nil { - return true - } - +// ruleIsApplicable returns true if the failed job and job failure reason match the failure policy rule. +// The function returns false otherwise. +func ruleIsApplicable(rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { ruleAppliesToJobFailureReason := len(rule.OnJobFailureReasons) == 0 || slices.Contains(rule.OnJobFailureReasons, jobFailureReason) if !ruleAppliesToJobFailureReason { return false @@ -93,68 +92,37 @@ func ruleIsApplicable(rule *jobset.FailurePolicyRule, failedJob *batchv1.Job, jo return ruleAppliesToParentReplicatedJob } -// The function findFirstFailedPolicyRuleAndJob returns the first failure policy rule matching a failed child job. +// findFirstFailedPolicyRuleAndJob returns the first failure policy rule matching a failed child job. // The function also returns the first child job matching the failure policy rule returned. -// If there does not exist a matching failure policy rule, the function returns nil for the failure policy rule -// accompanied with the first failing job. This function assumes that the jobset has a non nil failure policy. -func findFirstFailedPolicyRuleAndJob(ctx context.Context, js *jobset.JobSet, failedOwnedJobs []*batchv1.Job) (*jobset.FailurePolicyRule, *batchv1.Job) { +// If there does not exist a matching failure policy rule, then the function returns nil for all values. +func findFirstFailedPolicyRuleAndJob(ctx context.Context, rules []jobset.FailurePolicyRule, failedOwnedJobs []*batchv1.Job) (*jobset.FailurePolicyRule, *batchv1.Job) { log := ctrl.LoggerFrom(ctx) - rulesExist := len(js.Spec.FailurePolicy.Rules) > 0 - if !rulesExist { - firstFailedJob := findFirstFailedJob(failedOwnedJobs) - return nil, firstFailedJob - } - - // These variables are only to make the ensuing lines of code shorter. - rules := js.Spec.FailurePolicy.Rules - numRules := len(js.Spec.FailurePolicy.Rules) - - // bucket[i] corresponds to js.Spec.FailurePolicy.Rules[i] - type bucket struct { - firstFailedJob *batchv1.Job - firstFailureTime *metav1.Time - } - - // We make a bucket for each rule and then an extra bucket to represent the default rule - numBuckets := numRules + 1 - defaultRuleIndex := numRules - var buckets = make([]bucket, numRules+1) - - for _, failedJob := range failedOwnedJobs { - jobFailureCondition := findJobFailureCondition(failedJob) - // This means that the Job has not failed. - if jobFailureCondition == nil { - continue - } - - jobFailureTime, jobFailureReason := ptr.To(jobFailureCondition.LastTransitionTime), jobFailureCondition.Reason - for i := 0; i < numBuckets; i++ { - // We use nil to represent the default rule that - // applies to all job failure reasons. - var rule *jobset.FailurePolicyRule - if i < numRules { - rule = ptr.To(rules[i]) + for _, rule := range rules { + var matchedFailedJob *batchv1.Job + var matchedFailureTime *metav1.Time + for _, failedJob := range failedOwnedJobs { + jobFailureCondition := findJobFailureCondition(failedJob) + // This means that the Job has not failed. + if jobFailureCondition == nil { + continue } - if ruleIsApplicable(rule, failedJob, jobFailureReason) && (buckets[i].firstFailureTime == nil || jobFailureTime.Before(buckets[i].firstFailureTime)) { - buckets[i].firstFailedJob = failedJob - buckets[i].firstFailureTime = jobFailureTime + + jobFailureTime, jobFailureReason := ptr.To(jobFailureCondition.LastTransitionTime), jobFailureCondition.Reason + jobFailedEarlier := matchedFailedJob == nil || jobFailureTime.Before(matchedFailureTime) + if ruleIsApplicable(rule, failedJob, jobFailureReason) && jobFailedEarlier { + matchedFailedJob = failedJob + matchedFailureTime = jobFailureTime } } - } - // Checking if any failure policy rules were matched - // and returning the rule along with the first - // failed job to match it. - for i := 0; i < numRules; i++ { - if buckets[i].firstFailedJob != nil { - return &rules[i], buckets[i].firstFailedJob + if matchedFailedJob != nil { + return &rule, matchedFailedJob } } - // We get here when no rule matched any of the failure policy rules - log.V(2).Info("never found a matching rule and returning nil to represent the default rule.") - return nil, buckets[defaultRuleIndex].firstFailedJob + log.V(2).Info("never found a matched failure policy rule.") + return nil, nil } // failurePolicyRecreateAll triggers a JobSet restart for the next reconcillation loop. diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go index d6bb0699..23189340 100644 --- a/pkg/controllers/failure_policy_test.go +++ b/pkg/controllers/failure_policy_test.go @@ -36,18 +36,14 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { tests := []struct { name string - rule *jobset.FailurePolicyRule + rule jobset.FailurePolicyRule failedJob *batchv1.Job jobFailureReason string expected bool }{ - { - name: "failure policy rule is nil", - expected: true, - }, { name: "failure policy rule matches on job failure reason", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, TargetReplicatedJobs: []string{replicatedJobName1}, }, @@ -59,7 +55,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { }, { name: "failure policy rule matches all on job failure reason", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( @@ -70,7 +66,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { }, { name: "failure policy rule does not match on job failure reason", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, TargetReplicatedJobs: []string{replicatedJobName1}, }, @@ -82,7 +78,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { }, { name: "failure policy rule is not applicable to parent replicatedJob of failed job", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, TargetReplicatedJobs: []string{replicatedJobName1}, }, @@ -94,7 +90,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { }, { name: "failure policy rule is applicable to all replicatedjobs when targetedReplicatedJobs is omitted", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( @@ -105,7 +101,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { }, { name: "failure policy rule is applicable to parent replicatedJob when targetedReplicatedJobs is specified", - rule: &jobset.FailurePolicyRule{ + rule: jobset.FailurePolicyRule{ OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, TargetReplicatedJobs: []string{replicatedJobName1}, }, @@ -130,8 +126,6 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { var ( replicatedJobName = "test-replicatedJob" - jobSetName = "test-jobset" - ns = "default" failedJobNoReason1 = jobWithFailedCondition("job1", time.Now().Add(-6*time.Hour)) failedJobNoReason2 = jobWithFailedCondition("job2", time.Now().Add(-3*time.Hour)) @@ -180,7 +174,7 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { ) tests := []struct { name string - js *jobset.JobSet + rules []jobset.FailurePolicyRule failedOwnedJobs []*batchv1.Job expectedFailurePolicyRule *jobset.FailurePolicyRule @@ -188,7 +182,6 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { }{ { name: "failure policy rules are empty with no failed jobs", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), failedOwnedJobs: []*batchv1.Job{}, expectedFailurePolicyRule: nil, @@ -196,67 +189,55 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { }, { name: "failure policy rules are empty with one failed job", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), failedOwnedJobs: []*batchv1.Job{ failedJobNoReason1, }, expectedFailurePolicyRule: nil, - expectedJob: failedJobNoReason1, + expectedJob: nil, }, { name: "failure policy rules are empty with multiple failed jobs", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{}).Obj(), failedOwnedJobs: []*batchv1.Job{failedJobNoReason3, failedJobNoReason1, failedJobNoReason2}, expectedFailurePolicyRule: nil, - expectedJob: failedJobNoReason1, + expectedJob: nil, }, { - name: "failure policy rule does not match on job failure reasons", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{unmatchedRule}, - }).Obj(), + name: "failure policy rule does not match on job failure reasons", + rules: []jobset.FailurePolicyRule{unmatchedRule}, failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, expectedFailurePolicyRule: nil, - expectedJob: failedJob1, + expectedJob: nil, }, { - name: "failure policy rule matches first job to fail out of all jobs", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{rule1}, - }).Obj(), + name: "failure policy rule matches first job to fail out of all jobs", + rules: []jobset.FailurePolicyRule{rule1}, failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, expectedFailurePolicyRule: &rule1, expectedJob: failedJob1, }, { - name: "failure policy rule matches second job to fail out of all jobs", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{rule2}, - }).Obj(), + name: "failure policy rule matches second job to fail out of all jobs", + rules: []jobset.FailurePolicyRule{rule2}, failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, expectedFailurePolicyRule: &rule2, expectedJob: failedJob2, }, { - name: "failure policy rule matches multiple jobs and first failed job is the last one", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{rule2}, - }).Obj(), + name: "failure policy rule matches multiple jobs and first failed job is the last one", + rules: []jobset.FailurePolicyRule{rule2}, failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, expectedFailurePolicyRule: &rule2, expectedJob: failedJob2, }, { - name: "first failed job that matches a failure policy rule is different from the first job to fail that matches the first matched failure policy rule", - js: testutils.MakeJobSet(jobSetName, ns).FailurePolicy(&jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{rule2, rule1}, - }).Obj(), + name: "first failed job that matches a failure policy rule is different from the first job to fail that matches the first matched failure policy rule", + rules: []jobset.FailurePolicyRule{rule2, rule1}, // failedJob1 is the first failedJob1 but does not match rule2 which is the first failure policy rule to be matched failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, @@ -267,7 +248,7 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - actualRule, actualJob := findFirstFailedPolicyRuleAndJob(context.TODO(), tc.js, tc.failedOwnedJobs) + actualRule, actualJob := findFirstFailedPolicyRuleAndJob(context.TODO(), tc.rules, tc.failedOwnedJobs) if diff := cmp.Diff(tc.expectedJob, actualJob); diff != "" { t.Errorf("unexpected finished value (+got/-want): %s", diff) } From 497a7c7f57ae44b833c2d94970adcb39e33431d8 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 6 May 2024 18:46:51 +0000 Subject: [PATCH 03/25] Move functions related to failure policy to failure_policy.go file. --- pkg/controllers/failure_policy.go | 67 +++++++++++++++++++++++ pkg/controllers/failure_policy_test.go | 51 +++++++++++++++++ pkg/controllers/jobset_controller.go | 67 ----------------------- pkg/controllers/jobset_controller_test.go | 50 ----------------- 4 files changed, 118 insertions(+), 117 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index a231faf0..878cc301 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -72,6 +72,58 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi return nil } +// makeFailedConditionOpts returns the options we use to generate the JobSet failed condition. +func makeFailedConditionOpts(reason, msg string) *conditionOpts { + return &conditionOpts{ + condition: &metav1.Condition{ + Type: string(jobset.JobSetFailed), + Status: metav1.ConditionStatus(corev1.ConditionTrue), + Reason: reason, + Message: msg, + }, + eventType: corev1.EventTypeWarning, + } +} + +// setJobSetFailedCondition sets a condition on the JobSet status indicating it has failed. +func setJobSetFailedCondition(ctx context.Context, js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) { + setCondition(js, makeFailedConditionOpts(reason, msg), updateStatusOpts) +} + +// findJobFailureTime is a helper function which extracts the Job failure time from a Job, +// if the JobFailed condition exists and is true. +func findJobFailureTime(job *batchv1.Job) *metav1.Time { + failureCondition := findJobFailureCondition(job) + if failureCondition == nil { + return nil + } + return &failureCondition.LastTransitionTime +} + +// findFirstFailedJob accepts a slice of failed Jobs and returns the Job which has a JobFailed condition +// with the oldest transition time. +func findFirstFailedJob(failedJobs []*batchv1.Job) *batchv1.Job { + var ( + firstFailedJob *batchv1.Job + firstFailureTime *metav1.Time + ) + for _, job := range failedJobs { + failureTime := findJobFailureTime(job) + // If job has actually failed and it is the first (or only) failure we've seen, + // store the job for output. + if failureTime != nil && (firstFailedJob == nil || failureTime.Before(firstFailureTime)) { + firstFailedJob = job + firstFailureTime = failureTime + } + } + return firstFailedJob +} + +// messageWithFirstFailedJob appends the first failed job to the original event message in human readable way. +func messageWithFirstFailedJob(msg, firstFailedJobName string) string { + return fmt.Sprintf("%s (first failed job: %s)", msg, firstFailedJobName) +} + // ruleIsApplicable returns true if the failed job and job failure reason match the failure policy rule. // The function returns false otherwise. func ruleIsApplicable(rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { @@ -92,6 +144,21 @@ func ruleIsApplicable(rule jobset.FailurePolicyRule, failedJob *batchv1.Job, job return ruleAppliesToParentReplicatedJob } +// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job, +// if the JobFailed condition exists and is true. +func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { + if job == nil { + return nil + } + for _, c := range job.Status.Conditions { + // If this Job failed before the oldest known Job failiure, update the first failed job. + if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { + return &c + } + } + return nil +} + // findFirstFailedPolicyRuleAndJob returns the first failure policy rule matching a failed child job. // The function also returns the first child job matching the failure policy rule returned. // If there does not exist a matching failure policy rule, then the function returns nil for all values. diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go index 23189340..c66c2f31 100644 --- a/pkg/controllers/failure_policy_test.go +++ b/pkg/controllers/failure_policy_test.go @@ -19,13 +19,64 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" testutils "sigs.k8s.io/jobset/pkg/util/testing" ) +func TestFindFirstFailedJob(t *testing.T) { + testCases := []struct { + name string + failedJobs []*batchv1.Job + expected *batchv1.Job + }{ + { + name: "No failed jobs", + failedJobs: []*batchv1.Job{}, + expected: nil, + }, + { + name: "Single failed job", + failedJobs: []*batchv1.Job{ + jobWithFailedCondition("job1", time.Now().Add(-1*time.Hour)), + }, + expected: jobWithFailedCondition("job1", time.Now().Add(-1*time.Hour)), + }, + { + name: "Multiple failed jobs, earliest first", + failedJobs: []*batchv1.Job{ + jobWithFailedCondition("job1", time.Now().Add(-3*time.Hour)), + jobWithFailedCondition("job2", time.Now().Add(-5*time.Hour)), + }, + expected: jobWithFailedCondition("job2", time.Now().Add(-5*time.Hour)), + }, + { + name: "Jobs without failed condition", + failedJobs: []*batchv1.Job{ + {ObjectMeta: metav1.ObjectMeta{Name: "job1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "job2"}}, + }, + expected: nil, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + result := findFirstFailedJob(tc.failedJobs) + if result != nil && tc.expected != nil { + assert.Equal(t, result.Name, tc.expected.Name) + } else if result != nil && tc.expected == nil || result == nil && tc.expected != nil { + t.Errorf("Expected: %v, got: %v)", result, tc.expected) + } + }) + } +} + func TestFailurePolicyRuleIsApplicable(t *testing.T) { var ( replicatedJobName1 = "test-replicated-job-1" diff --git a/pkg/controllers/jobset_controller.go b/pkg/controllers/jobset_controller.go index 3ff68433..3fb1bb73 100644 --- a/pkg/controllers/jobset_controller.go +++ b/pkg/controllers/jobset_controller.go @@ -822,55 +822,6 @@ func findReplicatedJobStatus(replicatedJobStatus []jobset.ReplicatedJobStatus, r return jobset.ReplicatedJobStatus{} } -// messageWithFirstFailedJob appends the first failed job to the original event message in human readable way. -func messageWithFirstFailedJob(msg, firstFailedJobName string) string { - return fmt.Sprintf("%s (first failed job: %s)", msg, firstFailedJobName) -} - -// findFirstFailedJob accepts a slice of failed Jobs and returns the Job which has a JobFailed condition -// with the oldest transition time. -func findFirstFailedJob(failedJobs []*batchv1.Job) *batchv1.Job { - var ( - firstFailedJob *batchv1.Job - firstFailureTime *metav1.Time - ) - for _, job := range failedJobs { - failureTime := findJobFailureTime(job) - // If job has actually failed and it is the first (or only) failure we've seen, - // store the job for output. - if failureTime != nil && (firstFailedJob == nil || failureTime.Before(firstFailureTime)) { - firstFailedJob = job - firstFailureTime = failureTime - } - } - return firstFailedJob -} - -// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job, -// if the JobFailed condition exists and is true. -func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { - if job == nil { - return nil - } - for _, c := range job.Status.Conditions { - // If this Job failed before the oldest known Job failiure, update the first failed job. - if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { - return &c - } - } - return nil -} - -// findJobFailureTime is a helper function which extracts the Job failure time from a Job, -// if the JobFailed condition exists and is true. -func findJobFailureTime(job *batchv1.Job) *metav1.Time { - failureCondition := findJobFailureCondition(job) - if failureCondition == nil { - return nil - } - return &failureCondition.LastTransitionTime -} - // managedByExternalController returns a pointer to the name of the external controller managing // the JobSet, if one exists. Otherwise, it returns nil. func managedByExternalController(js *jobset.JobSet) *string { @@ -971,11 +922,6 @@ func setJobSetCompletedCondition(js *jobset.JobSet, updateStatusOpts *statusUpda setCondition(js, makeCompletedConditionsOpts(), updateStatusOpts) } -// setJobSetFailedCondition sets a condition on the JobSet status indicating it has failed. -func setJobSetFailedCondition(ctx context.Context, js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) { - setCondition(js, makeFailedConditionOpts(reason, msg), updateStatusOpts) -} - // setJobSetSuspendedCondition sets a condition on the JobSet status indicating it is currently suspended. func setJobSetSuspendedCondition(js *jobset.JobSet, updateStatusOpts *statusUpdateOpts) { setCondition(js, makeSuspendedConditionOpts(), updateStatusOpts) @@ -1000,19 +946,6 @@ func makeCompletedConditionsOpts() *conditionOpts { } } -// makeFailedConditionOpts returns the options we use to generate the JobSet failed condition. -func makeFailedConditionOpts(reason, msg string) *conditionOpts { - return &conditionOpts{ - condition: &metav1.Condition{ - Type: string(jobset.JobSetFailed), - Status: metav1.ConditionStatus(corev1.ConditionTrue), - Reason: reason, - Message: msg, - }, - eventType: corev1.EventTypeWarning, - } -} - // makeSuspendedConditionOpts returns the options we use to generate the JobSet suspended condition. func makeSuspendedConditionOpts() *conditionOpts { return &conditionOpts{ diff --git a/pkg/controllers/jobset_controller_test.go b/pkg/controllers/jobset_controller_test.go index c4bc3f5c..6901fc66 100644 --- a/pkg/controllers/jobset_controller_test.go +++ b/pkg/controllers/jobset_controller_test.go @@ -33,7 +33,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "github.com/stretchr/testify/assert" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -1107,55 +1106,6 @@ func TestCalculateReplicatedJobStatuses(t *testing.T) { } } -func TestFindFirstFailedJob(t *testing.T) { - testCases := []struct { - name string - failedJobs []*batchv1.Job - expected *batchv1.Job - }{ - { - name: "No failed jobs", - failedJobs: []*batchv1.Job{}, - expected: nil, - }, - { - name: "Single failed job", - failedJobs: []*batchv1.Job{ - jobWithFailedCondition("job1", time.Now().Add(-1*time.Hour)), - }, - expected: jobWithFailedCondition("job1", time.Now().Add(-1*time.Hour)), - }, - { - name: "Multiple failed jobs, earliest first", - failedJobs: []*batchv1.Job{ - jobWithFailedCondition("job1", time.Now().Add(-3*time.Hour)), - jobWithFailedCondition("job2", time.Now().Add(-5*time.Hour)), - }, - expected: jobWithFailedCondition("job2", time.Now().Add(-5*time.Hour)), - }, - { - name: "Jobs without failed condition", - failedJobs: []*batchv1.Job{ - {ObjectMeta: metav1.ObjectMeta{Name: "job1"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "job2"}}, - }, - expected: nil, - }, - } - - for _, tc := range testCases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - result := findFirstFailedJob(tc.failedJobs) - if result != nil && tc.expected != nil { - assert.Equal(t, result.Name, tc.expected.Name) - } else if result != nil && tc.expected == nil || result == nil && tc.expected != nil { - t.Errorf("Expected: %v, got: %v)", result, tc.expected) - } - }) - } -} - // Helper function to create a job object with a failed condition func jobWithFailedCondition(name string, failureTime time.Time) *batchv1.Job { return jobWithFailedConditionAndOpts(name, failureTime, nil) From 9b5c2522e302ee0b2c9a32f8f2f74236ac1f3832 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 00:07:40 +0000 Subject: [PATCH 04/25] Add name property to failure policy rules. --- api/jobset/v1alpha2/jobset_types.go | 4 + api/jobset/v1alpha2/openapi_generated.go | 10 +- .../jobset/v1alpha2/failurepolicyrule.go | 9 + .../crd/bases/jobset.x-k8s.io_jobsets.yaml | 7 + hack/python-sdk/swagger.json | 6 + pkg/webhooks/jobset_webhook.go | 87 +++++- pkg/webhooks/jobset_webhook_test.go | 272 ++++++++++++++++++ .../docs/JobsetV1alpha2FailurePolicyRule.md | 1 + .../jobset_v1alpha2_failure_policy_rule.py | 31 +- .../test_jobset_v1alpha2_failure_policy.py | 1 + ...est_jobset_v1alpha2_failure_policy_rule.py | 2 + .../test/test_jobset_v1alpha2_job_set.py | 1 + .../test/test_jobset_v1alpha2_job_set_list.py | 2 + .../test/test_jobset_v1alpha2_job_set_spec.py | 1 + 14 files changed, 417 insertions(+), 17 deletions(-) diff --git a/api/jobset/v1alpha2/jobset_types.go b/api/jobset/v1alpha2/jobset_types.go index 54733cb1..3ba52f7f 100644 --- a/api/jobset/v1alpha2/jobset_types.go +++ b/api/jobset/v1alpha2/jobset_types.go @@ -252,6 +252,10 @@ const ( // FailurePolicyRule defines a FailurePolicyAction to be executed if a child job // fails due to a reason listed in OnJobFailureReasons. type FailurePolicyRule struct { + // The name of the failure policy rule. + // The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. + // The name must match the regular expression "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$". + Name string `json:"name"` // The action to take if the rule is matched. // +kubebuilder:validation:Enum:=FailJobSet;RestartJobSet;RestartJobSetAndIgnoreMaxRestarts Action FailurePolicyAction `json:"action"` diff --git a/api/jobset/v1alpha2/openapi_generated.go b/api/jobset/v1alpha2/openapi_generated.go index e4953b34..55cc4b6b 100644 --- a/api/jobset/v1alpha2/openapi_generated.go +++ b/api/jobset/v1alpha2/openapi_generated.go @@ -80,6 +80,14 @@ func schema_jobset_api_jobset_v1alpha2_FailurePolicyRule(ref common.ReferenceCal Description: "FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fails due to a reason listed in OnJobFailureReasons.", Type: []string{"object"}, Properties: map[string]spec.Schema{ + "name": { + SchemaProps: spec.SchemaProps{ + Description: "The name of the failure policy rule. The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. The name must match the regular expression \"^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$\".", + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, "action": { SchemaProps: spec.SchemaProps{ Description: "The action to take if the rule is matched.", @@ -124,7 +132,7 @@ func schema_jobset_api_jobset_v1alpha2_FailurePolicyRule(ref common.ReferenceCal }, }, }, - Required: []string{"action", "onJobFailureReasons"}, + Required: []string{"name", "action", "onJobFailureReasons"}, }, }, } diff --git a/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go index 9bec63bf..870a6c06 100644 --- a/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go +++ b/client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go @@ -21,6 +21,7 @@ import ( // FailurePolicyRuleApplyConfiguration represents an declarative configuration of the FailurePolicyRule type for use // with apply. type FailurePolicyRuleApplyConfiguration struct { + Name *string `json:"name,omitempty"` Action *v1alpha2.FailurePolicyAction `json:"action,omitempty"` OnJobFailureReasons []string `json:"onJobFailureReasons,omitempty"` TargetReplicatedJobs []string `json:"targetReplicatedJobs,omitempty"` @@ -32,6 +33,14 @@ func FailurePolicyRule() *FailurePolicyRuleApplyConfiguration { return &FailurePolicyRuleApplyConfiguration{} } +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *FailurePolicyRuleApplyConfiguration) WithName(value string) *FailurePolicyRuleApplyConfiguration { + b.Name = &value + return b +} + // WithAction sets the Action field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Action field is set to the value of the last call. diff --git a/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml b/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml index c9bc1b30..ebd17bc1 100644 --- a/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml +++ b/config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml @@ -86,6 +86,12 @@ spec: - RestartJobSet - RestartJobSetAndIgnoreMaxRestarts type: string + name: + description: |- + The name of the failure policy rule. + The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. + The name must match the regular expression "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$". + type: string onJobFailureReasons: description: |- The requirement on the job failure reasons. The requirement @@ -106,6 +112,7 @@ spec: x-kubernetes-list-type: atomic required: - action + - name - onJobFailureReasons type: object type: array diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 40d945ff..8df6b367 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -29,6 +29,7 @@ "description": "FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fails due to a reason listed in OnJobFailureReasons.", "type": "object", "required": [ + "name", "action", "onJobFailureReasons" ], @@ -38,6 +39,11 @@ "type": "string", "default": "" }, + "name": { + "description": "The name of the failure policy rule. The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. The name must match the regular expression \"^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$\".", + "type": "string", + "default": "" + }, "onJobFailureReasons": { "description": "The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason.", "type": "array", diff --git a/pkg/webhooks/jobset_webhook.go b/pkg/webhooks/jobset_webhook.go index edef50df..55a56d8c 100644 --- a/pkg/webhooks/jobset_webhook.go +++ b/pkg/webhooks/jobset_webhook.go @@ -18,6 +18,7 @@ import ( "errors" "fmt" "math" + "regexp" "strconv" "strings" @@ -98,6 +99,9 @@ func (j *jobSetWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error { Complete() } +const defaultRuleNameFmt = "failurePolicyRule%v" + +// Default performs defaulting of jobset values as defined in the JobSet API. func (j *jobSetWebhook) Default(ctx context.Context, obj runtime.Object) error { js, ok := obj.(*jobset.JobSet) if !ok { @@ -132,11 +136,77 @@ func (j *jobSetWebhook) Default(ctx context.Context, obj runtime.Object) error { js.Spec.Network.PublishNotReadyAddresses = ptr.To(true) } + // Apply the default failure policy rule name policy. + if js.Spec.FailurePolicy != nil { + for i := range js.Spec.FailurePolicy.Rules { + rule := &js.Spec.FailurePolicy.Rules[i] + if len(rule.Name) == 0 { + rule.Name = fmt.Sprintf(defaultRuleNameFmt, i) + } + } + } + return nil } //+kubebuilder:webhook:path=/validate-jobset-x-k8s-io-v1alpha2-jobset,mutating=false,failurePolicy=fail,sideEffects=None,groups=jobset.x-k8s.io,resources=jobsets,verbs=create;update,versions=v1alpha2,name=vjobset.kb.io,admissionReviewVersions=v1 +const minRuleNameLength = 1 +const maxRuleNameLength = 128 +const ruleNameFmt = "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$" + +var ruleNameRegexp = regexp.MustCompile(ruleNameFmt) + +// validateFailurePolicy performs validation for jobset failure policies and returns all errors detected. +func validateFailurePolicy(failurePolicy *jobset.FailurePolicy, validReplicatedJobs []string) []error { + var allErrs []error + if failurePolicy == nil { + return allErrs + } + + // ruleNameToRulesWithName is used to verify that rule names are unique + ruleNameToRulesWithName := make(map[string]([]int)) + for index, rule := range failurePolicy.Rules { + // Check that the rule name meets the minimum length + nameLen := len(rule.Name) + if !(minRuleNameLength <= nameLen && nameLen <= maxRuleNameLength) { + err := fmt.Errorf("invalid failure policy rule name of length %v, the rule name must be at least %v characters long and at most %v characters long", nameLen, minRuleNameLength, maxRuleNameLength) + allErrs = append(allErrs, err) + } + + ruleNameToRulesWithName[rule.Name] = append(ruleNameToRulesWithName[rule.Name], index) + + if !ruleNameRegexp.MatchString(rule.Name) { + err := fmt.Errorf("invalid failure policy rule name '%v', a failure policy rule name must start with an alphabetic character, optionally followed by a string of alphanumeric characters or '_,:', and must end with an alphanumeric character or '_'", rule.Name) + allErrs = append(allErrs, err) + } + + // Validate the rules target replicated jobs are valid + for _, rjobName := range rule.TargetReplicatedJobs { + if !collections.Contains(validReplicatedJobs, rjobName) { + allErrs = append(allErrs, fmt.Errorf("invalid replicatedJob name '%s' in failure policy does not appear in .spec.ReplicatedJobs", rjobName)) + } + } + + // Validate the rules on job failure reasons are valid + for _, failureReason := range rule.OnJobFailureReasons { + if !collections.Contains(validOnJobFailureReasons, failureReason) { + allErrs = append(allErrs, fmt.Errorf("invalid job failure reason '%s' in failure policy is not a recognized job failure reason", failureReason)) + } + } + } + + // Checking that rule names are unique + for ruleName, rulesWithName := range ruleNameToRulesWithName { + if len(rulesWithName) > 1 { + err := fmt.Errorf("rule names are not unique, rules with indices %v all have the same name '%v'", rulesWithName, ruleName) + allErrs = append(allErrs, err) + } + } + + return allErrs +} + // ValidateCreate implements webhook.Validator so a webhook will be registered for the type func (j *jobSetWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { js, ok := obj.(*jobset.JobSet) @@ -221,21 +291,8 @@ func (j *jobSetWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) // Validate failure policy if js.Spec.FailurePolicy != nil { - for _, rule := range js.Spec.FailurePolicy.Rules { - // Validate the rules target replicated jobs are valid - for _, rjobName := range rule.TargetReplicatedJobs { - if !collections.Contains(validReplicatedJobs, rjobName) { - allErrs = append(allErrs, fmt.Errorf("invalid replicatedJob name '%s' in failure policy does not appear in .spec.ReplicatedJobs", rjobName)) - } - } - - // Validate the rules on job failure reasons are valid - for _, failureReason := range rule.OnJobFailureReasons { - if !collections.Contains(validOnJobFailureReasons, failureReason) { - allErrs = append(allErrs, fmt.Errorf("invalid job failure reason '%s' in failure policy is not a recognized job failure reason", failureReason)) - } - } - } + failurePolicyErrors := validateFailurePolicy(js.Spec.FailurePolicy, validReplicatedJobs) + allErrs = append(allErrs, failurePolicyErrors...) } return nil, errors.Join(allErrs...) } diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index 9068bf90..77dbd0d1 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -35,6 +35,9 @@ var TestPodTemplate = corev1.PodTemplateSpec{ }, } +// Question: Should we seperate these tests into different files as jobset_webhook_test.go is getting very long? +// One possible method of separation is to seperate by the value being defaulted. +// It would also be nice to reduce the amount of boiler plate repeated in test cases. func TestJobSetDefaulting(t *testing.T) { defaultSuccessPolicy := &jobset.SuccessPolicy{Operator: jobset.OperatorAll} defaultStartupPolicy := &jobset.StartupPolicy{StartupPolicyOrder: jobset.AnyOrder} @@ -593,6 +596,98 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + { + name: "failure policy rule name is defaulted when: there is one rule and it does not have a name", + js: &jobset.JobSet{ + Spec: jobset.JobSetSpec{ + SuccessPolicy: defaultSuccessPolicy, + Network: defaultNetwork, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + Template: TestPodTemplate, + CompletionMode: completionModePtr(batchv1.IndexedCompletion), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: make([]jobset.FailurePolicyRule, 1), + }, + }, + }, + want: &jobset.JobSet{ + Spec: jobset.JobSetSpec{ + SuccessPolicy: defaultSuccessPolicy, + StartupPolicy: defaultStartupPolicy, + Network: defaultNetwork, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + Template: TestPodTemplate, + CompletionMode: completionModePtr(batchv1.IndexedCompletion), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "failurePolicyRule0"}, + }, + }, + }, + }, + }, + { + name: "failure policy rule name is defaulted when: there are two rules, the first rule has a name, the second rule does not have a name", + js: &jobset.JobSet{ + Spec: jobset.JobSetSpec{ + SuccessPolicy: defaultSuccessPolicy, + Network: defaultNetwork, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + Template: TestPodTemplate, + CompletionMode: completionModePtr(batchv1.IndexedCompletion), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "ruleWithAName"}, + {}, + }, + }, + }, + }, + want: &jobset.JobSet{ + Spec: jobset.JobSetSpec{ + SuccessPolicy: defaultSuccessPolicy, + StartupPolicy: defaultStartupPolicy, + Network: defaultNetwork, + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + Template: TestPodTemplate, + CompletionMode: completionModePtr(batchv1.IndexedCompletion), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "ruleWithAName"}, + {Name: "failurePolicyRule1"}, + }, + }, + }, + }, + }, } fakeClient := fake.NewFakeClient() webhook, err := NewJobSetWebhook(fakeClient) @@ -1037,6 +1132,183 @@ func TestValidateCreate(t *testing.T) { fmt.Errorf("invalid replicatedJob name '%s' in failure policy does not appear in .spec.ReplicatedJobs", "fakeReplicatedJob"), ), }, + { + name: "jobset failure policy rule name is 0 characters long a.k.a unset", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: make([]jobset.FailurePolicyRule, 1), + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid failure policy rule name of length %v, the rule name must be at least %v characters long and at most %v characters long", 0, minRuleNameLength, maxRuleNameLength), + ), + }, + { + name: "jobset failure policy rule name is greater than 128 characters long", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: strings.Repeat("a", 129)}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid failure policy rule name of length %v, the rule name must be at least %v characters long and at most %v characters long", 129, minRuleNameLength, maxRuleNameLength), + ), + }, + { + name: "there are two failure policy rules with the same name", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "repeatedRuleName"}, + {Name: "repeatedRuleName"}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("rule names are not unique, rules with indices %v all have the same name '%v'", []int{0, 1}, "repeatedRuleName"), + ), + }, + { + name: "failure policy rule name does not start with an alphabetic character", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "1ruleToRuleThemAll"}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid failure policy rule name '%v', a failure policy rule name must start with an alphabetic character, optionally followed by a string of alphanumeric characters or '_,:', and must end with an alphanumeric character or '_'", "1ruleToRuleThemAll"), + ), + }, + { + name: "failure policy rule name does not end with an alphanumeric nor '_'", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "ruleToRuleThemAll,"}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join( + fmt.Errorf("invalid failure policy rule name '%v', a failure policy rule name must start with an alphabetic character, optionally followed by a string of alphanumeric characters or '_,:', and must end with an alphanumeric character or '_'", "ruleToRuleThemAll,"), + ), + }, + { + name: "failure policy rule name is valid", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "superAwesomeFailurePolicyRule"}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join(), + }, } fakeClient := fake.NewFakeClient() webhook, err := NewJobSetWebhook(fakeClient) diff --git a/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md b/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md index 1b98164a..4f123b36 100644 --- a/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md +++ b/sdk/python/docs/JobsetV1alpha2FailurePolicyRule.md @@ -5,6 +5,7 @@ FailurePolicyRule defines a FailurePolicyAction to be executed if a child job fa Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **action** | **str** | The action to take if the rule is matched. | [default to ''] +**name** | **str** | The name of the failure policy rule. The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. The name must match the regular expression \"^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$\". | [default to ''] **on_job_failure_reasons** | **list[str]** | The requirement on the job failure reasons. The requirement is satisfied if at least one reason matches the list. The rules are evaluated in order, and the first matching rule is executed. An empty list applies the rule to any job failure reason. | **target_replicated_jobs** | **list[str]** | TargetReplicatedJobs are the names of the replicated jobs the operator applies to. An empty list will apply to all replicatedJobs. | [optional] diff --git a/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py index 3d5871d4..5c1054bb 100644 --- a/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py +++ b/sdk/python/jobset/models/jobset_v1alpha2_failure_policy_rule.py @@ -34,28 +34,32 @@ class JobsetV1alpha2FailurePolicyRule(object): """ openapi_types = { 'action': 'str', + 'name': 'str', 'on_job_failure_reasons': 'list[str]', 'target_replicated_jobs': 'list[str]' } attribute_map = { 'action': 'action', + 'name': 'name', 'on_job_failure_reasons': 'onJobFailureReasons', 'target_replicated_jobs': 'targetReplicatedJobs' } - def __init__(self, action='', on_job_failure_reasons=None, target_replicated_jobs=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, action='', name='', on_job_failure_reasons=None, target_replicated_jobs=None, local_vars_configuration=None): # noqa: E501 """JobsetV1alpha2FailurePolicyRule - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() self.local_vars_configuration = local_vars_configuration self._action = None + self._name = None self._on_job_failure_reasons = None self._target_replicated_jobs = None self.discriminator = None self.action = action + self.name = name self.on_job_failure_reasons = on_job_failure_reasons if target_replicated_jobs is not None: self.target_replicated_jobs = target_replicated_jobs @@ -85,6 +89,31 @@ def action(self, action): self._action = action + @property + def name(self): + """Gets the name of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + + The name of the failure policy rule. The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. The name must match the regular expression \"^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$\". # noqa: E501 + + :return: The name of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :rtype: str + """ + return self._name + + @name.setter + def name(self, name): + """Sets the name of this JobsetV1alpha2FailurePolicyRule. + + The name of the failure policy rule. The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. The name must match the regular expression \"^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$\". # noqa: E501 + + :param name: The name of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 + :type: str + """ + if self.local_vars_configuration.client_side_validation and name is None: # noqa: E501 + raise ValueError("Invalid value for `name`, must not be `None`") # noqa: E501 + + self._name = name + @property def on_job_failure_reasons(self): """Gets the on_job_failure_reasons of this JobsetV1alpha2FailurePolicyRule. # noqa: E501 diff --git a/sdk/python/test/test_jobset_v1alpha2_failure_policy.py b/sdk/python/test/test_jobset_v1alpha2_failure_policy.py index b0498b70..72e21671 100644 --- a/sdk/python/test/test_jobset_v1alpha2_failure_policy.py +++ b/sdk/python/test/test_jobset_v1alpha2_failure_policy.py @@ -42,6 +42,7 @@ def make_instance(self, include_optional): rules = [ jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], diff --git a/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py b/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py index f5740fc7..b20b6e47 100644 --- a/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py +++ b/sdk/python/test/test_jobset_v1alpha2_failure_policy_rule.py @@ -39,6 +39,7 @@ def make_instance(self, include_optional): if include_optional : return JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], @@ -49,6 +50,7 @@ def make_instance(self, include_optional): else : return JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set.py b/sdk/python/test/test_jobset_v1alpha2_job_set.py index 80bb1478..34f58a0e 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set.py @@ -47,6 +47,7 @@ def make_instance(self, include_optional): rules = [ jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set_list.py b/sdk/python/test/test_jobset_v1alpha2_job_set_list.py index d1be43fe..dad4a1d9 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set_list.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set_list.py @@ -50,6 +50,7 @@ def make_instance(self, include_optional): rules = [ jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], @@ -106,6 +107,7 @@ def make_instance(self, include_optional): rules = [ jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], diff --git a/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py b/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py index 4010e5fd..b000d6bd 100644 --- a/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py +++ b/sdk/python/test/test_jobset_v1alpha2_job_set_spec.py @@ -43,6 +43,7 @@ def make_instance(self, include_optional): rules = [ jobset.models.jobset_v1alpha2_failure_policy_rule.JobsetV1alpha2FailurePolicyRule( action = '0', + name = '0', on_job_failure_reasons = [ '0' ], From 3ea78cfaab91fb647dfe429aeda92b9054bd0352 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 00:25:27 +0000 Subject: [PATCH 05/25] Add log statement in ruleIsApplicable when parent replicatedJob is not found. --- pkg/controllers/failure_policy.go | 9 +++++---- pkg/controllers/failure_policy_test.go | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index 878cc301..71a88059 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -126,7 +126,9 @@ func messageWithFirstFailedJob(msg, firstFailedJobName string) string { // ruleIsApplicable returns true if the failed job and job failure reason match the failure policy rule. // The function returns false otherwise. -func ruleIsApplicable(rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { +func ruleIsApplicable(ctx context.Context, rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { + log := ctrl.LoggerFrom(ctx) + ruleAppliesToJobFailureReason := len(rule.OnJobFailureReasons) == 0 || slices.Contains(rule.OnJobFailureReasons, jobFailureReason) if !ruleAppliesToJobFailureReason { return false @@ -135,8 +137,7 @@ func ruleIsApplicable(rule jobset.FailurePolicyRule, failedJob *batchv1.Job, job parentReplicatedJob, exists := parentReplicatedJobName(failedJob) if !exists { // If we cannot find the parent ReplicatedJob, we assume the rule does not apply. - // TODO: Add a log statement that the failedJob does not appear to have a parent replicated job. - // This error should not happen, but was a pain to debug when adding unit tests. + log.V(2).Info("The failed job %v does not appear to have a parent replicatedJob.", failedJob.Name) return false } @@ -177,7 +178,7 @@ func findFirstFailedPolicyRuleAndJob(ctx context.Context, rules []jobset.Failure jobFailureTime, jobFailureReason := ptr.To(jobFailureCondition.LastTransitionTime), jobFailureCondition.Reason jobFailedEarlier := matchedFailedJob == nil || jobFailureTime.Before(matchedFailureTime) - if ruleIsApplicable(rule, failedJob, jobFailureReason) && jobFailedEarlier { + if ruleIsApplicable(ctx, rule, failedJob, jobFailureReason) && jobFailedEarlier { matchedFailedJob = failedJob matchedFailureTime = jobFailureTime } diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go index c66c2f31..251acc82 100644 --- a/pkg/controllers/failure_policy_test.go +++ b/pkg/controllers/failure_policy_test.go @@ -166,7 +166,7 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - actual := ruleIsApplicable(tc.rule, tc.failedJob, tc.jobFailureReason) + actual := ruleIsApplicable(context.TODO(), tc.rule, tc.failedJob, tc.jobFailureReason) if diff := cmp.Diff(tc.expected, actual); diff != "" { t.Errorf("unexpected finished value (+got/-want): %s", diff) } From dcd462b2ea4974b56030ea5f31586007da4efc2d Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 04:08:11 +0000 Subject: [PATCH 06/25] Add definitions for the fields of eventParams. --- pkg/controllers/jobset_controller.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/controllers/jobset_controller.go b/pkg/controllers/jobset_controller.go index 3fb1bb73..e0e31343 100644 --- a/pkg/controllers/jobset_controller.go +++ b/pkg/controllers/jobset_controller.go @@ -75,9 +75,13 @@ type statusUpdateOpts struct { // eventParams contains parameters used for emitting a Kubernetes event. type eventParams struct { - object runtime.Object - eventType string - eventReason string + // object is the object that caused the event or is for some other reason associated with the event. + object runtime.Object + // eventType is a machine interpretable CamelCase string like "PanicTypeFooBar" describing the type of event. + eventType string + // eventReason is a machine interpretable CamelCase string like "FailureReasonFooBar" describing the reason for the event. + eventReason string + // eventMessage is a human interpretable sentence with details about the event. eventMessage string } From 92a3e8b6891440f2ec44cc08f7e08dfe257a7b03 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 18:25:59 +0000 Subject: [PATCH 07/25] Implement event reasons and messages for each failure policy action. --- pkg/constants/constants.go | 12 ++++++++++ pkg/controllers/failure_policy.go | 38 +++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 63ff1e1c..9eeb4e1f 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -75,4 +75,16 @@ const ( // Event reason and message related to resuming a JobSet. JobSetResumedReason = "ResumeJobs" JobSetResumedMessage = "jobset is resumed" + + // Event reason and message related to applying the FailJobSet failure policy action. + FailJobSetActionReason = "FailJobSetFailurePolicyAction" + FailJobSetActionMessage = "applying FailJobSet failure policy action" + + // Event reason and message related to applying the RestartJobSet failure policy action. + RestartJobSetActionReason = "RestartJobSetFailurePolicyAction" + RestartJobSetActionMessage = "applying RestartJobSet failure policy action" + + // Event reason and message related to applying the RestartJobSetAndIgnoreMaxRestarts failure policy action. + RestartJobSetAndIgnoreMaxRestartsActionReason = "RestartJobSetAndIgnoreMaxRestartsFailurePolicyActionReason" + RestartJobSetAndIgnoreMaxRestartsActionMessage = "applying RestartJobSetAndIgnoreMaxRestarts failure policy action" ) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index 71a88059..645c2b08 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -235,37 +235,51 @@ type failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, m // failJobSetActionApplier applies the FailJobSet FailurePolicyAction var failJobSetActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { - failureMessage := messageWithFirstFailedJob(constants.ReachedMaxRestartsMessage, matchingFailedJob.Name) - setJobSetFailedCondition(ctx, js, constants.ReachedMaxRestartsReason, failureMessage, updateStatusOpts) + failureBaseMessage := constants.FailJobSetActionMessage + failureMessage := messageWithFirstFailedJob(failureBaseMessage, matchingFailedJob.Name) + + failureReason := constants.FailJobSetActionReason + setJobSetFailedCondition(ctx, js, failureReason, failureMessage, updateStatusOpts) return nil } // restartJobSetActionApplier applies the RestartJobSet FailurePolicyAction var restartJobSetActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { if js.Status.RestartsCountTowardsMax >= js.Spec.FailurePolicy.MaxRestarts { - failureMessage := messageWithFirstFailedJob(constants.ReachedMaxRestartsMessage, matchingFailedJob.Name) - setJobSetFailedCondition(ctx, js, constants.ReachedMaxRestartsReason, failureMessage, updateStatusOpts) + failureBaseMessage := constants.ReachedMaxRestartsMessage + failureMessage := messageWithFirstFailedJob(failureBaseMessage, matchingFailedJob.Name) + + failureReason := constants.ReachedMaxRestartsReason + setJobSetFailedCondition(ctx, js, failureReason, failureMessage, updateStatusOpts) return nil } - shouldCountTowardsMax := true + baseMessage := constants.RestartJobSetActionMessage + eventMessage := messageWithFirstFailedJob(baseMessage, matchingFailedJob.Name) event := &eventParams{ - object: js, - eventType: corev1.EventTypeWarning, - eventReason: fmt.Sprintf("restarting jobset, attempt %d", js.Status.Restarts), + object: js, + eventType: corev1.EventTypeWarning, + eventReason: constants.RestartJobSetActionReason, + eventMessage: eventMessage, } + + shouldCountTowardsMax := true failurePolicyRecreateAll(ctx, js, shouldCountTowardsMax, updateStatusOpts, event) return nil } // restartJobSetAndIgnoreMaxRestartsActionApplier applies the RestartJobSetAndIgnoreMaxRestarts FailurePolicyAction var restartJobSetAndIgnoreMaxRestartsActionApplier failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error { - shouldCountTowardsMax := false + baseMessage := constants.RestartJobSetAndIgnoreMaxRestartsActionMessage + eventMessage := messageWithFirstFailedJob(baseMessage, matchingFailedJob.Name) event := &eventParams{ - object: js, - eventType: corev1.EventTypeWarning, - eventReason: fmt.Sprintf("restarting jobset without counting towards maximum restarts, attempt %d", js.Status.Restarts), + object: js, + eventType: corev1.EventTypeWarning, + eventReason: constants.RestartJobSetAndIgnoreMaxRestartsActionReason, + eventMessage: eventMessage, } + + shouldCountTowardsMax := false failurePolicyRecreateAll(ctx, js, shouldCountTowardsMax, updateStatusOpts, event) return nil } From 898493f6a5a4808b2dc3529d21f82d890290c38b Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 20:45:17 +0000 Subject: [PATCH 08/25] Make the default failure policy rule action more clear. --- pkg/controllers/failure_policy.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index 645c2b08..5d98ae47 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -35,6 +35,9 @@ var actionFunctionMap = map[jobset.FailurePolicyAction]failurePolicyActionApplie jobset.RestartJobSetAndIgnoreMaxRestarts: restartJobSetAndIgnoreMaxRestartsActionApplier, } +// The source of truth for the definition of defaultFailurePolicyRuleAction is the Configurable Failure Policy KEP. +const defaultFailurePolicyRuleAction = jobset.RestartJobSet + // executeFailurePolicy applies the Failure Policy of a JobSet when a failed child Job is found. // This function is run only when a failed child job has already been found. func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *childJobs, updateStatusOpts *statusUpdateOpts) error { @@ -58,7 +61,7 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi var failurePolicyRuleAction jobset.FailurePolicyAction if failurePolicyRule == nil { - failurePolicyRuleAction = jobset.RestartJobSet + failurePolicyRuleAction = defaultFailurePolicyRuleAction matchingFailedJob = findFirstFailedJob(ownedJobs.failed) } else { failurePolicyRuleAction = failurePolicyRule.Action From e866397d71a5bafc38c03b6326efdd3e8696d690 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 22:33:34 +0000 Subject: [PATCH 09/25] Refactor TestFailurePolicyRuleIsApplicable to improve standardization. --- pkg/controllers/failure_policy_test.go | 90 +++++++++++++++++++------- 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go index 251acc82..e2863bdb 100644 --- a/pkg/controllers/failure_policy_test.go +++ b/pkg/controllers/failure_policy_test.go @@ -81,8 +81,12 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { var ( replicatedJobName1 = "test-replicated-job-1" replicatedJobName2 = "test-replicated-job-2" - jobName = "test-job" - ns = "default" + + jobFailureReason1 = batchv1.JobReasonBackoffLimitExceeded + jobFailureReason2 = batchv1.JobReasonDeadlineExceeded + + jobName = "test-job" + ns = "default" ) tests := []struct { @@ -93,74 +97,112 @@ func TestFailurePolicyRuleIsApplicable(t *testing.T) { expected bool }{ { - name: "failure policy rule matches on job failure reason", + name: "a job has failed and the failure policy rule matches all possible parent replicated jobs and matches all possible job failure reasons", + rule: jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{}, + TargetReplicatedJobs: []string{}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: jobFailureReason1, + expected: true, + }, + { + name: "a job has failed and the failure policy rule matches all possible parent replicated jobs and matches the job failure reason", + rule: jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{jobFailureReason1}, + TargetReplicatedJobs: []string{}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: jobFailureReason1, + expected: true, + }, + { + name: "a job has failed and the failure policy rule matches all possible parent replicated jobs and does not match the job failure reason", rule: jobset.FailurePolicyRule{ - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{jobFailureReason1}, + TargetReplicatedJobs: []string{}, + }, + failedJob: testutils.MakeJob(jobName, ns).JobLabels( + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + ).Obj(), + jobFailureReason: jobFailureReason2, + expected: false, + }, + { + name: "a job has failed and the failure policy rule matches the parent replicatedJob and matches all possible job failure reasons", + rule: jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{}, TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, ).Obj(), - jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, + jobFailureReason: jobFailureReason1, expected: true, }, { - name: "failure policy rule matches all on job failure reason", + name: "a job has failed and the failure policy rule matches the parent replicatedJob and matches the job failure reason", rule: jobset.FailurePolicyRule{ + OnJobFailureReasons: []string{jobFailureReason1}, TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, ).Obj(), - jobFailureReason: batchv1.JobReasonMaxFailedIndexesExceeded, + jobFailureReason: jobFailureReason1, expected: true, }, { - name: "failure policy rule does not match on job failure reason", + name: "a job has failed and the failure policy rule matches the parent replicatedJob and does not match the job failure reason", rule: jobset.FailurePolicyRule{ - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{jobFailureReason1}, TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, ).Obj(), - jobFailureReason: batchv1.JobReasonDeadlineExceeded, + jobFailureReason: jobFailureReason2, expected: false, }, { - name: "failure policy rule is not applicable to parent replicatedJob of failed job", + name: "a job has failed and the failure policy rule does not match the parent replicatedJob and matches all possible job failure reasons", rule: jobset.FailurePolicyRule{ - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{}, TargetReplicatedJobs: []string{replicatedJobName1}, }, - jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, failedJob: testutils.MakeJob(jobName, ns).JobLabels( map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName2}, ).Obj(), - expected: false, + jobFailureReason: jobFailureReason1, + expected: false, }, { - name: "failure policy rule is applicable to all replicatedjobs when targetedReplicatedJobs is omitted", + name: "a job has failed and the failure policy rule does not match the parent replicatedJob and matches the job failure reason", rule: jobset.FailurePolicyRule{ - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{jobFailureReason1}, + TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( - map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName2}, ).Obj(), - jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, - expected: true, + jobFailureReason: jobFailureReason1, + expected: false, }, { - name: "failure policy rule is applicable to parent replicatedJob when targetedReplicatedJobs is specified", + name: "a job has failed and the failure policy rule does not match the parent replicatedJob and does not match the job failure reason", rule: jobset.FailurePolicyRule{ - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{jobFailureReason1}, TargetReplicatedJobs: []string{replicatedJobName1}, }, failedJob: testutils.MakeJob(jobName, ns).JobLabels( - map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName1}, + map[string]string{jobset.ReplicatedJobNameKey: replicatedJobName2}, ).Obj(), - jobFailureReason: batchv1.JobReasonBackoffLimitExceeded, - expected: true, + jobFailureReason: jobFailureReason2, + expected: false, }, } From ebac8e1786116126053c335d9fe6695aa9480b07 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 22:45:42 +0000 Subject: [PATCH 10/25] Remove outdated comment. --- pkg/controllers/failure_policy.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index 5d98ae47..c7534ac8 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -155,7 +155,6 @@ func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { return nil } for _, c := range job.Status.Conditions { - // If this Job failed before the oldest known Job failiure, update the first failed job. if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { return &c } From 9af5c065940f39a4b32cd27e45532456ee633a81 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Wed, 8 May 2024 23:09:52 +0000 Subject: [PATCH 11/25] Create function parseFailJobOpts to remove closure in jobWithFailedConditionAndOpts. --- pkg/controllers/jobset_controller_test.go | 32 ++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/pkg/controllers/jobset_controller_test.go b/pkg/controllers/jobset_controller_test.go index 6901fc66..a978b55c 100644 --- a/pkg/controllers/jobset_controller_test.go +++ b/pkg/controllers/jobset_controller_test.go @@ -1116,24 +1116,26 @@ type failJobOptions struct { parentReplicatedJobName *string } -// Helper function to create a job object with a failed condition -func jobWithFailedConditionAndOpts(name string, failureTime time.Time, opts *failJobOptions) *batchv1.Job { - var reason string - labels := make(map[string]string) - applyOpts := func() { - if opts == nil { - return - } +func parseFailJobOpts(opts *failJobOptions) (reason string, labels map[string]string) { + if opts == nil { + return "", nil + } - if opts.reason != nil { - reason = *opts.reason - } + if opts.parentReplicatedJobName != nil { + labels = make(map[string]string) + labels[jobset.ReplicatedJobNameKey] = *opts.parentReplicatedJobName + } - if opts.parentReplicatedJobName != nil { - labels[jobset.ReplicatedJobNameKey] = *opts.parentReplicatedJobName - } + if opts.reason != nil { + reason = *opts.reason } - applyOpts() + + return reason, labels +} + +// Helper function to create a job object with a failed condition +func jobWithFailedConditionAndOpts(name string, failureTime time.Time, opts *failJobOptions) *batchv1.Job { + reason, labels := parseFailJobOpts(opts) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: name, Labels: labels}, From ac8c3ea9c66487bb900e41da214bdee509b29e01 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Thu, 9 May 2024 17:59:03 +0000 Subject: [PATCH 12/25] Refactor TestJobSetDefaulting. --- pkg/webhooks/jobset_webhook_test.go | 54 ++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index 77dbd0d1..e7252fed 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -35,18 +35,18 @@ var TestPodTemplate = corev1.PodTemplateSpec{ }, } -// Question: Should we seperate these tests into different files as jobset_webhook_test.go is getting very long? -// One possible method of separation is to seperate by the value being defaulted. -// It would also be nice to reduce the amount of boiler plate repeated in test cases. +type jobSetDefaultingTestCase struct { + name string + js *jobset.JobSet + want *jobset.JobSet +} + func TestJobSetDefaulting(t *testing.T) { defaultSuccessPolicy := &jobset.SuccessPolicy{Operator: jobset.OperatorAll} defaultStartupPolicy := &jobset.StartupPolicy{StartupPolicyOrder: jobset.AnyOrder} defaultNetwork := &jobset.Network{EnableDNSHostnames: ptr.To(true), PublishNotReadyAddresses: ptr.To(true)} - testCases := []struct { - name string - js *jobset.JobSet - want *jobset.JobSet - }{ + + jobCompletionTests := []jobSetDefaultingTestCase{ { name: "job completion mode is unset", js: &jobset.JobSet{ @@ -123,6 +123,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + enablingDNSHostnameTests := []jobSetDefaultingTestCase{ { name: "enableDNSHostnames is unset", js: &jobset.JobSet{ @@ -200,6 +203,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + publishNotReadyNetworkAddresessTests := []jobSetDefaultingTestCase{ { name: "PublishNotReadyNetworkAddresess is false", js: &jobset.JobSet{ @@ -278,6 +284,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + podRestartTests := []jobSetDefaultingTestCase{ { name: "pod restart policy unset", js: &jobset.JobSet{ @@ -370,6 +379,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + successPolicyTests := []jobSetDefaultingTestCase{ { name: "success policy unset", js: &jobset.JobSet{ @@ -467,6 +479,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + startupPolicyTests := []jobSetDefaultingTestCase{ { name: "startup policy order InOrder set", js: &jobset.JobSet{ @@ -522,6 +537,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + managedByTests := []jobSetDefaultingTestCase{ { name: "managedBy field is left nil", js: &jobset.JobSet{ @@ -596,6 +614,9 @@ func TestJobSetDefaulting(t *testing.T) { }, }, }, + } + + failurePolicyRuleNameTests := []jobSetDefaultingTestCase{ { name: "failure policy rule name is defaulted when: there is one rule and it does not have a name", js: &jobset.JobSet{ @@ -689,6 +710,23 @@ func TestJobSetDefaulting(t *testing.T) { }, }, } + + testGroups := [][]jobSetDefaultingTestCase{ + jobCompletionTests, + enablingDNSHostnameTests, + publishNotReadyNetworkAddresessTests, + podRestartTests, + successPolicyTests, + startupPolicyTests, + startupPolicyTests, + managedByTests, + failurePolicyRuleNameTests, + } + var testCases []jobSetDefaultingTestCase + for _, testGroup := range testGroups { + testCases = append(testCases, testGroup...) + } + fakeClient := fake.NewFakeClient() webhook, err := NewJobSetWebhook(fakeClient) if err != nil { From e91b6b0d00de9a4518b3aee71bb4e2354c20757e Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Thu, 9 May 2024 18:13:21 +0000 Subject: [PATCH 13/25] Refactor TestValidateCreate. --- pkg/webhooks/jobset_webhook_test.go | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index e7252fed..cb7b2a28 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -745,6 +745,12 @@ func TestJobSetDefaulting(t *testing.T) { } } +type validationTestCase struct { + name string + js *jobset.JobSet + want error +} + func TestValidateCreate(t *testing.T) { managedByFieldPath := field.NewPath("spec", "managedBy") @@ -773,11 +779,7 @@ func TestValidateCreate(t *testing.T) { }, } - testCases := []struct { - name string - js *jobset.JobSet - want error - }{ + uncategorizedTests := []validationTestCase{ { name: "number of pods exceeds the limit", js: &jobset.JobSet{ @@ -1003,6 +1005,9 @@ func TestValidateCreate(t *testing.T) { fmt.Errorf(podNameTooLongErrorMsg), ), }, + } + + jobsetControllerNameTests := []validationTestCase{ { name: "jobset controller name is not a domain-prefixed path", js: &jobset.JobSet{ @@ -1102,6 +1107,9 @@ func TestValidateCreate(t *testing.T) { }, want: errors.Join(), }, + } + + failurePolicyTests := []validationTestCase{ { name: "jobset failure policy has an invalid on job failure reason", js: &jobset.JobSet{ @@ -1348,6 +1356,17 @@ func TestValidateCreate(t *testing.T) { want: errors.Join(), }, } + + testGroups := [][]validationTestCase{ + uncategorizedTests, + jobsetControllerNameTests, + failurePolicyTests, + } + var testCases []validationTestCase + for _, testGroup := range testGroups { + testCases = append(testCases, testGroup...) + } + fakeClient := fake.NewFakeClient() webhook, err := NewJobSetWebhook(fakeClient) if err != nil { From 4de124b35fecf6d54ebedfe48f0eb9f3e02d19d5 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Thu, 9 May 2024 18:22:18 +0000 Subject: [PATCH 14/25] Add logging when a matching failure policy rule is found. --- pkg/controllers/failure_policy.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index c7534ac8..e049468d 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -168,7 +168,7 @@ func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { func findFirstFailedPolicyRuleAndJob(ctx context.Context, rules []jobset.FailurePolicyRule, failedOwnedJobs []*batchv1.Job) (*jobset.FailurePolicyRule, *batchv1.Job) { log := ctrl.LoggerFrom(ctx) - for _, rule := range rules { + for index, rule := range rules { var matchedFailedJob *batchv1.Job var matchedFailureTime *metav1.Time for _, failedJob := range failedOwnedJobs { @@ -187,8 +187,10 @@ func findFirstFailedPolicyRuleAndJob(ctx context.Context, rules []jobset.Failure } if matchedFailedJob != nil { + log.V(2).Info("found a failed job matching failure policy rule with index %v and name %v", index, rule.Name) return &rule, matchedFailedJob } + log.V(2).Info("did not find a failed job matching failure policy rule with index %v and name %v", index, rule.Name) } log.V(2).Info("never found a matched failure policy rule.") From c5ae410a254da9be6e39b22ac7d8c4e898f49f57 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Thu, 9 May 2024 19:22:29 +0000 Subject: [PATCH 15/25] In TestValidateCreate, move the test for a valid failure policy rule to be the first failure policy rule test. --- pkg/webhooks/jobset_webhook_test.go | 56 ++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index cb7b2a28..1170f8f1 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -1110,6 +1110,34 @@ func TestValidateCreate(t *testing.T) { } failurePolicyTests := []validationTestCase{ + { + name: "failure policy rule name is valid", + js: &jobset.JobSet{ + ObjectMeta: validObjectMeta, + Spec: jobset.JobSetSpec{ + ReplicatedJobs: []jobset.ReplicatedJob{ + { + Name: "rj", + Replicas: 1, + Template: batchv1.JobTemplateSpec{ + Spec: batchv1.JobSpec{ + CompletionMode: ptr.To(batchv1.IndexedCompletion), + Completions: ptr.To(int32(1)), + Parallelism: ptr.To(int32(1)), + }, + }, + }, + }, + FailurePolicy: &jobset.FailurePolicy{ + Rules: []jobset.FailurePolicyRule{ + {Name: "superAwesomeFailurePolicyRule"}, + }, + }, + SuccessPolicy: &jobset.SuccessPolicy{}, + }, + }, + want: errors.Join(), + }, { name: "jobset failure policy has an invalid on job failure reason", js: &jobset.JobSet{ @@ -1327,34 +1355,6 @@ func TestValidateCreate(t *testing.T) { fmt.Errorf("invalid failure policy rule name '%v', a failure policy rule name must start with an alphabetic character, optionally followed by a string of alphanumeric characters or '_,:', and must end with an alphanumeric character or '_'", "ruleToRuleThemAll,"), ), }, - { - name: "failure policy rule name is valid", - js: &jobset.JobSet{ - ObjectMeta: validObjectMeta, - Spec: jobset.JobSetSpec{ - ReplicatedJobs: []jobset.ReplicatedJob{ - { - Name: "rj", - Replicas: 1, - Template: batchv1.JobTemplateSpec{ - Spec: batchv1.JobSpec{ - CompletionMode: ptr.To(batchv1.IndexedCompletion), - Completions: ptr.To(int32(1)), - Parallelism: ptr.To(int32(1)), - }, - }, - }, - }, - FailurePolicy: &jobset.FailurePolicy{ - Rules: []jobset.FailurePolicyRule{ - {Name: "superAwesomeFailurePolicyRule"}, - }, - }, - SuccessPolicy: &jobset.SuccessPolicy{}, - }, - }, - want: errors.Join(), - }, } testGroups := [][]validationTestCase{ From 9f0d08bda86d6b6445c0462e8b4a1c5de9037272 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Thu, 9 May 2024 19:42:14 +0000 Subject: [PATCH 16/25] Add comment defining TestValidateCreate. --- pkg/webhooks/jobset_webhook_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/webhooks/jobset_webhook_test.go b/pkg/webhooks/jobset_webhook_test.go index 1170f8f1..4ca173c5 100644 --- a/pkg/webhooks/jobset_webhook_test.go +++ b/pkg/webhooks/jobset_webhook_test.go @@ -751,6 +751,12 @@ type validationTestCase struct { want error } +// TestValidateCreate tests the ValidateCreate method of the jobset webhook. +// Each test case specifies a list of expected errors. +// For each test case, the function TestValidateCreate checks that each +// expected error is present in the list of errors returned by +// ValidateCreate. It is okay if ValidateCreate returns errors +// beyond the expected errors. func TestValidateCreate(t *testing.T) { managedByFieldPath := field.NewPath("spec", "managedBy") From 5b8f55d9b8960f96b3332953eee8e423c634fa1a Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Tue, 14 May 2024 18:45:15 +0000 Subject: [PATCH 17/25] Refactor TestFindFirstFailedPolicyRuleAndJob to make individual test case names more clear. --- pkg/controllers/failure_policy_test.go | 126 ++++++++++++++----------- 1 file changed, 73 insertions(+), 53 deletions(-) diff --git a/pkg/controllers/failure_policy_test.go b/pkg/controllers/failure_policy_test.go index e2863bdb..c928231c 100644 --- a/pkg/controllers/failure_policy_test.go +++ b/pkg/controllers/failure_policy_test.go @@ -220,50 +220,54 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { var ( replicatedJobName = "test-replicatedJob" - failedJobNoReason1 = jobWithFailedCondition("job1", time.Now().Add(-6*time.Hour)) - failedJobNoReason2 = jobWithFailedCondition("job2", time.Now().Add(-3*time.Hour)) - failedJobNoReason3 = jobWithFailedCondition("job2", time.Now().Add(-1*time.Hour)) + failureReason1 = batchv1.JobReasonBackoffLimitExceeded + failureReason2 = batchv1.JobReasonDeadlineExceeded + failureReason3 = batchv1.JobReasonFailedIndexes failedJob1 = jobWithFailedConditionAndOpts("job1", time.Now().Add(-6*time.Hour), &failJobOptions{ - reason: ptr.To(batchv1.JobReasonBackoffLimitExceeded), + reason: ptr.To(failureReason1), + parentReplicatedJobName: ptr.To(replicatedJobName), + }, + ) + failedJob1DupEarlierFailure = jobWithFailedConditionAndOpts("job1DupEarlierFailure", time.Now().Add(-12*time.Hour), + &failJobOptions{ + reason: ptr.To(failureReason1), parentReplicatedJobName: ptr.To(replicatedJobName), }, ) failedJob2 = jobWithFailedConditionAndOpts("job2", time.Now().Add(-3*time.Hour), &failJobOptions{ - reason: ptr.To(batchv1.JobReasonDeadlineExceeded), + reason: ptr.To(failureReason2), parentReplicatedJobName: ptr.To(replicatedJobName), }, ) failedJob3 = jobWithFailedConditionAndOpts("job3", time.Now().Add(-1*time.Hour), &failJobOptions{ - reason: ptr.To(batchv1.JobReasonFailedIndexes), + reason: ptr.To(failureReason3), parentReplicatedJobName: ptr.To(replicatedJobName), }, ) - // ruleN matches failedJobN - rule1 = jobset.FailurePolicyRule{ + // failurePolicyRuleN matches failedJobN + failurePolicyRule1 = jobset.FailurePolicyRule{ Action: jobset.RestartJobSet, - OnJobFailureReasons: []string{batchv1.JobReasonBackoffLimitExceeded}, + OnJobFailureReasons: []string{failureReason1}, } - rule2 = jobset.FailurePolicyRule{ + failurePolicyRule2 = jobset.FailurePolicyRule{ Action: jobset.RestartJobSet, - OnJobFailureReasons: []string{batchv1.JobReasonDeadlineExceeded}, + OnJobFailureReasons: []string{failureReason2}, } - - unmatchedRule = jobset.FailurePolicyRule{ + failurePolicyRule3 = jobset.FailurePolicyRule{ Action: jobset.RestartJobSet, - OnJobFailureReasons: []string{batchv1.JobReasonMaxFailedIndexesExceeded, batchv1.JobReasonPodFailurePolicy}, + OnJobFailureReasons: []string{failureReason3}, } - extraFailedJob = jobWithFailedConditionAndOpts("extra-job1", time.Now().Add(3*time.Hour), - &failJobOptions{ - reason: ptr.To(batchv1.JobReasonDeadlineExceeded), - parentReplicatedJobName: ptr.To(replicatedJobName), - }, - ) + fakeReplicatedJobName = "fakeReplicatedJobName" + unmatchedFailurePolicyRule = jobset.FailurePolicyRule{ + Action: jobset.RestartJobSet, + TargetReplicatedJobs: []string{fakeReplicatedJobName}, + } ) tests := []struct { name string @@ -274,68 +278,84 @@ func TestFindFirstFailedPolicyRuleAndJob(t *testing.T) { expectedJob *batchv1.Job }{ { - name: "failure policy rules are empty with no failed jobs", + name: "There are 0 failed jobs and there are 0 failure policy rules, therefore nil is returned for both the rule and job.", + rules: []jobset.FailurePolicyRule{}, failedOwnedJobs: []*batchv1.Job{}, expectedFailurePolicyRule: nil, expectedJob: nil, }, { - name: "failure policy rules are empty with one failed job", - failedOwnedJobs: []*batchv1.Job{ - failedJobNoReason1, - }, + name: "There is 1 failed job and there are 0 failure policy rules, therefore nil is returned for both the rule and job.", + rules: []jobset.FailurePolicyRule{}, + failedOwnedJobs: []*batchv1.Job{failedJob1}, expectedFailurePolicyRule: nil, expectedJob: nil, }, { - name: "failure policy rules are empty with multiple failed jobs", - failedOwnedJobs: []*batchv1.Job{failedJobNoReason3, failedJobNoReason1, failedJobNoReason2}, + name: "There is 1 failed job, there is 1 failure policy rule, and the failure policy rule does not match the job, therefore nil is returned for both the rule and job.", + rules: []jobset.FailurePolicyRule{failurePolicyRule2}, + failedOwnedJobs: []*batchv1.Job{failedJob1}, expectedFailurePolicyRule: nil, expectedJob: nil, }, { - name: "failure policy rule does not match on job failure reasons", - rules: []jobset.FailurePolicyRule{unmatchedRule}, - failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + name: "There is 1 failed job, there are 3 failure policy rules, and the failed job matches the first failure policy rule, therefore the job and first failure policy rule are returned.", + rules: []jobset.FailurePolicyRule{failurePolicyRule1, failurePolicyRule2, failurePolicyRule3}, + failedOwnedJobs: []*batchv1.Job{failedJob1}, - expectedFailurePolicyRule: nil, - expectedJob: nil, + expectedFailurePolicyRule: &failurePolicyRule1, + expectedJob: failedJob1, }, { - name: "failure policy rule matches first job to fail out of all jobs", - rules: []jobset.FailurePolicyRule{rule1}, - failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + name: "There is 1 failed job, there are 3 failure policy rules, the failed job does not match the first failure policy rule, and the failed job matches the second failure policy rule, therefore the job and second failure policy rule are returned.", + rules: []jobset.FailurePolicyRule{failurePolicyRule1, failurePolicyRule2, failurePolicyRule3}, + failedOwnedJobs: []*batchv1.Job{failedJob2}, - expectedFailurePolicyRule: &rule1, - expectedJob: failedJob1, + expectedFailurePolicyRule: &failurePolicyRule2, + expectedJob: failedJob2, }, { - name: "failure policy rule matches second job to fail out of all jobs", - rules: []jobset.FailurePolicyRule{rule2}, - failedOwnedJobs: []*batchv1.Job{failedJob3, failedJob1, failedJob2}, + name: "There is 1 failed job, there are 3 failure policy rules, the failed job does not match the first nor second failure policy rules, and the failed job matches the third failure policy rule, therefore the job and the the third failure policy rule are returned.", + rules: []jobset.FailurePolicyRule{failurePolicyRule1, failurePolicyRule2, failurePolicyRule3}, + failedOwnedJobs: []*batchv1.Job{failedJob3}, - expectedFailurePolicyRule: &rule2, - expectedJob: failedJob2, + expectedFailurePolicyRule: &failurePolicyRule3, + expectedJob: failedJob3, }, { - name: "failure policy rule matches multiple jobs and first failed job is the last one", - rules: []jobset.FailurePolicyRule{rule2}, - failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, + name: "There are 2 failed jobs and there are 0 failure policy rules, therefore nil is returned for both the rule and job.", + rules: []jobset.FailurePolicyRule{}, + failedOwnedJobs: []*batchv1.Job{failedJob1, failedJob2}, - expectedFailurePolicyRule: &rule2, - expectedJob: failedJob2, + expectedFailurePolicyRule: nil, + expectedJob: nil, }, { - name: "first failed job that matches a failure policy rule is different from the first job to fail that matches the first matched failure policy rule", - rules: []jobset.FailurePolicyRule{rule2, rule1}, - // failedJob1 is the first failedJob1 but does not match rule2 which is the first failure policy rule to be matched - failedOwnedJobs: []*batchv1.Job{extraFailedJob, failedJob3, failedJob1, failedJob2}, + name: "There are 2 failed jobs, the second job failed before the first job, there is 1 failure policy rule, and the failure policy rule does not match any of the jobs, therefore nil is returned for both the rule and job.", + rules: []jobset.FailurePolicyRule{failurePolicyRule2}, + failedOwnedJobs: []*batchv1.Job{failedJob1, failedJob1DupEarlierFailure}, - expectedFailurePolicyRule: &rule2, - expectedJob: failedJob2, + expectedFailurePolicyRule: nil, + expectedJob: nil, + }, + { + name: "There are 2 failed jobs, the second job failed before the first job, there is 1 failure policy rule, and the failure policy rule matches both jobs, therefore the second job and the failure policy rule are returned.", + rules: []jobset.FailurePolicyRule{failurePolicyRule1}, + failedOwnedJobs: []*batchv1.Job{failedJob1, failedJob1DupEarlierFailure}, + + expectedFailurePolicyRule: &failurePolicyRule1, + expectedJob: failedJob1DupEarlierFailure, + }, + { + name: "There are 2 failed jobs, the second job failed before the first job, there are 2 failure policy rules, the first failure policy does not match any of the jobs, and the second failure policy rule matches both jobs, therefore the second job and the second failure policy rule are returned.", + rules: []jobset.FailurePolicyRule{unmatchedFailurePolicyRule, failurePolicyRule1}, + failedOwnedJobs: []*batchv1.Job{failedJob1, failedJob1DupEarlierFailure}, + + expectedFailurePolicyRule: &failurePolicyRule1, + expectedJob: failedJob1DupEarlierFailure, }, } From 0bdffeb15247a35a79b321c0dc141d597085ab23 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Fri, 17 May 2024 19:18:09 +0000 Subject: [PATCH 18/25] Update jobset controller integration tests related to failure policies. I added `[failure policy]` to the begin of the name of each test related to failure policies so that it is easier to select only those tests to run. I also updated tests to check that `RestartsCountTowardsMax` is incrementing only when expected. --- .../controller/jobset_controller_test.go | 98 +++++++++++++++++-- 1 file changed, 88 insertions(+), 10 deletions(-) diff --git a/test/integration/controller/jobset_controller_test.go b/test/integration/controller/jobset_controller_test.go index ddf73701..7d48aca7 100644 --- a/test/integration/controller/jobset_controller_test.go +++ b/test/integration/controller/jobset_controller_test.go @@ -365,7 +365,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, }, }), - ginkgo.Entry("jobset with no failure policy should fail if any jobs fail", &testCase{ + ginkgo.Entry("[failure policy] jobset with no failure policy should fail if any jobs fail", &testCase{ makeJobSet: testJobSet, updates: []*update{ { @@ -443,7 +443,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, }, }), - ginkgo.Entry("jobset fails immediately with FailJobSet failure policy action.", &testCase{ + ginkgo.Entry("[failure policy] jobset fails immediately with FailJobSet failure policy action.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -463,9 +463,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetFailed, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, }, }), - ginkgo.Entry("jobset does not fail immediately with FailJobSet failure policy action.", &testCase{ + ginkgo.Entry("[failure policy] jobset does not fail immediately with FailJobSet failure policy action as the rule is not matched.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -494,7 +499,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, }, }), - ginkgo.Entry("jobset restarts with RestartJobSet Failure Policy Action.", &testCase{ + ginkgo.Entry("[failure policy] jobset restarts with RestartJobSet failure policy action.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -518,9 +523,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, }, }), - ginkgo.Entry("jobset restarts with RestartJobSetAndIgnoremaxRestarts Failure Policy Action.", &testCase{ + ginkgo.Entry("[failure policy] jobset restarts with RestartJobSetAndIgnoremaxRestarts failure policy action.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -544,6 +554,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -557,6 +572,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -570,6 +590,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -579,7 +604,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, }, }), - ginkgo.Entry("job fails and the parent replicated job is contained in TargetReplicatedJobs.", &testCase{ + ginkgo.Entry("[failure policy] job fails and the parent replicated job is contained in TargetReplicatedJobs.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -600,9 +625,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetFailed, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, }, }), - ginkgo.Entry("job fails and the parent replicated job is not contained in TargetReplicatedJobs.", &testCase{ + ginkgo.Entry("[failure policy] job fails and the parent replicated job is not contained in TargetReplicatedJobs.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -633,9 +663,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { { checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, }, }), - ginkgo.Entry("failure policy rules order verification test 1", &testCase{ + ginkgo.Entry("[failure policy] failure policy rules order verification test 1", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -661,9 +696,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetFailed, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, }, }), - ginkgo.Entry("failure policy rules order verification test 2", &testCase{ + ginkgo.Entry("[failure policy] failure policy rules order verification test 2", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -696,9 +736,14 @@ var _ = ginkgo.Describe("JobSet controller", func() { removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) }, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, }, }), - ginkgo.Entry("failure policy rules order verification test 3", &testCase{ + ginkgo.Entry("[failure policy] failure policy rules order verification test 3", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -724,6 +769,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -737,6 +787,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -750,6 +805,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -763,6 +823,11 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetFailed, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestartsCountTowardsMax(js, 0) + }, + }, }, }), ginkgo.Entry("job succeeds after one failure", &testCase{ @@ -1949,6 +2014,19 @@ func jobActive(job *batchv1.Job) bool { return active } +// matchJobSetRestartsCountTowardsMax checks that the supplied jobset js has expectedMaxRestarts +// as the value of js.Status.RestartsCountTowardsMax. +func matchJobSetRestartsCountTowardsMax(js *jobset.JobSet, expectedMaxRestarts int32) { + gomega.Eventually(func() (int32, error) { + newJs := jobset.JobSet{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: js.Name, Namespace: js.Namespace}, &newJs); err != nil { + return 0, err + } + + return newJs.Status.RestartsCountTowardsMax, nil + }, timeout, interval).Should(gomega.BeComparableTo(expectedMaxRestarts)) +} + func matchJobSetReplicatedStatus(js *jobset.JobSet, expectedStatus []jobset.ReplicatedJobStatus) { gomega.Eventually(func() ([]jobset.ReplicatedJobStatus, error) { newJs := jobset.JobSet{} From a8354ff59a39cd20638ef34d776368d28098abb2 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Fri, 17 May 2024 20:18:07 +0000 Subject: [PATCH 19/25] Add check for value of .status.restarts field. --- .../controller/jobset_controller_test.go | 63 ++++++++++++++----- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/test/integration/controller/jobset_controller_test.go b/test/integration/controller/jobset_controller_test.go index 7d48aca7..92eb8b6d 100644 --- a/test/integration/controller/jobset_controller_test.go +++ b/test/integration/controller/jobset_controller_test.go @@ -465,6 +465,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 0) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -490,6 +491,12 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -525,12 +532,13 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) matchJobSetRestartsCountTowardsMax(js, 1) }, }, }, }), - ginkgo.Entry("[failure policy] jobset restarts with RestartJobSetAndIgnoremaxRestarts failure policy action.", &testCase{ + ginkgo.Entry("[failure policy] jobset restarts with RestartJobSetAndIgnoreMaxRestarts failure policy action.", &testCase{ makeJobSet: func(ns *corev1.Namespace) *testing.JobSetWrapper { return testJobSet(ns). FailurePolicy(&jobset.FailurePolicy{ @@ -556,6 +564,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -574,6 +583,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 2) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -592,6 +602,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 3) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -627,6 +638,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 0) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -653,6 +665,12 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -660,14 +678,6 @@ var _ = ginkgo.Describe("JobSet controller", func() { removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) }, }, - { - checkJobSetCondition: testutil.JobSetActive, - }, - { - checkJobSetState: func(js *jobset.JobSet) { - matchJobSetRestartsCountTowardsMax(js, 1) - }, - }, }, }), ginkgo.Entry("[failure policy] failure policy rules order verification test 1", &testCase{ @@ -698,6 +708,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 0) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -729,6 +740,12 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, checkJobSetCondition: testutil.JobSetActive, }, + { + checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) + matchJobSetRestartsCountTowardsMax(js, 1) + }, + }, { jobSetUpdateFn: func(js *jobset.JobSet) { // For a restart, all jobs will be deleted and recreated, so we expect a @@ -736,11 +753,6 @@ var _ = ginkgo.Describe("JobSet controller", func() { removeForegroundDeletionFinalizers(js, testutil.NumExpectedJobs(js)) }, }, - { - checkJobSetState: func(js *jobset.JobSet) { - matchJobSetRestartsCountTowardsMax(js, 1) - }, - }, }, }), ginkgo.Entry("[failure policy] failure policy rules order verification test 3", &testCase{ @@ -771,6 +783,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 1) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -789,6 +802,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 2) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -807,6 +821,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 3) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -825,6 +840,7 @@ var _ = ginkgo.Describe("JobSet controller", func() { }, { checkJobSetState: func(js *jobset.JobSet) { + matchJobSetRestarts(js, 3) matchJobSetRestartsCountTowardsMax(js, 0) }, }, @@ -2014,9 +2030,22 @@ func jobActive(job *batchv1.Job) bool { return active } -// matchJobSetRestartsCountTowardsMax checks that the supplied jobset js has expectedMaxRestarts +// matchJobSetRestarts checks that the supplied jobset js has expectedCount +// as the value of js.Status.Restarts. +func matchJobSetRestarts(js *jobset.JobSet, expectedCount int32) { + gomega.Eventually(func() (int32, error) { + newJs := jobset.JobSet{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: js.Name, Namespace: js.Namespace}, &newJs); err != nil { + return 0, err + } + + return newJs.Status.Restarts, nil + }, timeout, interval).Should(gomega.BeComparableTo(expectedCount)) +} + +// matchJobSetRestartsCountTowardsMax checks that the supplied jobset js has expectedCount // as the value of js.Status.RestartsCountTowardsMax. -func matchJobSetRestartsCountTowardsMax(js *jobset.JobSet, expectedMaxRestarts int32) { +func matchJobSetRestartsCountTowardsMax(js *jobset.JobSet, expectedCount int32) { gomega.Eventually(func() (int32, error) { newJs := jobset.JobSet{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: js.Name, Namespace: js.Namespace}, &newJs); err != nil { @@ -2024,7 +2053,7 @@ func matchJobSetRestartsCountTowardsMax(js *jobset.JobSet, expectedMaxRestarts i } return newJs.Status.RestartsCountTowardsMax, nil - }, timeout, interval).Should(gomega.BeComparableTo(expectedMaxRestarts)) + }, timeout, interval).Should(gomega.BeComparableTo(expectedCount)) } func matchJobSetReplicatedStatus(js *jobset.JobSet, expectedStatus []jobset.ReplicatedJobStatus) { From bf03055120d05a6e73bdc1ea70e5c66fb1f9d5e7 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 18:23:10 +0000 Subject: [PATCH 20/25] Remove 'Reason' from the end of RestartJobSetAndIgnoreMaxRestartsActionReason value. --- pkg/constants/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 9eeb4e1f..4d641ab7 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -85,6 +85,6 @@ const ( RestartJobSetActionMessage = "applying RestartJobSet failure policy action" // Event reason and message related to applying the RestartJobSetAndIgnoreMaxRestarts failure policy action. - RestartJobSetAndIgnoreMaxRestartsActionReason = "RestartJobSetAndIgnoreMaxRestartsFailurePolicyActionReason" + RestartJobSetAndIgnoreMaxRestartsActionReason = "RestartJobSetAndIgnoreMaxRestartsFailurePolicyAction" RestartJobSetAndIgnoreMaxRestartsActionMessage = "applying RestartJobSetAndIgnoreMaxRestarts failure policy action" ) From 715777a109cc636bb246804bd1b2721bf78b3893 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 18:29:13 +0000 Subject: [PATCH 21/25] Change the ordering of functions in failure_policy.go. --- pkg/controllers/failure_policy.go | 226 +++++++++++++++--------------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index e049468d..e0852401 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -75,93 +75,6 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi return nil } -// makeFailedConditionOpts returns the options we use to generate the JobSet failed condition. -func makeFailedConditionOpts(reason, msg string) *conditionOpts { - return &conditionOpts{ - condition: &metav1.Condition{ - Type: string(jobset.JobSetFailed), - Status: metav1.ConditionStatus(corev1.ConditionTrue), - Reason: reason, - Message: msg, - }, - eventType: corev1.EventTypeWarning, - } -} - -// setJobSetFailedCondition sets a condition on the JobSet status indicating it has failed. -func setJobSetFailedCondition(ctx context.Context, js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) { - setCondition(js, makeFailedConditionOpts(reason, msg), updateStatusOpts) -} - -// findJobFailureTime is a helper function which extracts the Job failure time from a Job, -// if the JobFailed condition exists and is true. -func findJobFailureTime(job *batchv1.Job) *metav1.Time { - failureCondition := findJobFailureCondition(job) - if failureCondition == nil { - return nil - } - return &failureCondition.LastTransitionTime -} - -// findFirstFailedJob accepts a slice of failed Jobs and returns the Job which has a JobFailed condition -// with the oldest transition time. -func findFirstFailedJob(failedJobs []*batchv1.Job) *batchv1.Job { - var ( - firstFailedJob *batchv1.Job - firstFailureTime *metav1.Time - ) - for _, job := range failedJobs { - failureTime := findJobFailureTime(job) - // If job has actually failed and it is the first (or only) failure we've seen, - // store the job for output. - if failureTime != nil && (firstFailedJob == nil || failureTime.Before(firstFailureTime)) { - firstFailedJob = job - firstFailureTime = failureTime - } - } - return firstFailedJob -} - -// messageWithFirstFailedJob appends the first failed job to the original event message in human readable way. -func messageWithFirstFailedJob(msg, firstFailedJobName string) string { - return fmt.Sprintf("%s (first failed job: %s)", msg, firstFailedJobName) -} - -// ruleIsApplicable returns true if the failed job and job failure reason match the failure policy rule. -// The function returns false otherwise. -func ruleIsApplicable(ctx context.Context, rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { - log := ctrl.LoggerFrom(ctx) - - ruleAppliesToJobFailureReason := len(rule.OnJobFailureReasons) == 0 || slices.Contains(rule.OnJobFailureReasons, jobFailureReason) - if !ruleAppliesToJobFailureReason { - return false - } - - parentReplicatedJob, exists := parentReplicatedJobName(failedJob) - if !exists { - // If we cannot find the parent ReplicatedJob, we assume the rule does not apply. - log.V(2).Info("The failed job %v does not appear to have a parent replicatedJob.", failedJob.Name) - return false - } - - ruleAppliesToParentReplicatedJob := len(rule.TargetReplicatedJobs) == 0 || slices.Contains(rule.TargetReplicatedJobs, parentReplicatedJob) - return ruleAppliesToParentReplicatedJob -} - -// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job, -// if the JobFailed condition exists and is true. -func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { - if job == nil { - return nil - } - for _, c := range job.Status.Conditions { - if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { - return &c - } - } - return nil -} - // findFirstFailedPolicyRuleAndJob returns the first failure policy rule matching a failed child job. // The function also returns the first child job matching the failure policy rule returned. // If there does not exist a matching failure policy rule, then the function returns nil for all values. @@ -197,6 +110,47 @@ func findFirstFailedPolicyRuleAndJob(ctx context.Context, rules []jobset.Failure return nil, nil } +// applyFailurePolicyRuleAction applies the supplied FailurePolicyRuleAction. +func applyFailurePolicyRuleAction(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOps *statusUpdateOpts, failurePolicyRuleAction jobset.FailurePolicyAction) error { + log := ctrl.LoggerFrom(ctx) + + applier, ok := actionFunctionMap[failurePolicyRuleAction] + if !ok { + err := fmt.Errorf("failed to find a corresponding action for the FailurePolicyRuleAction %v", failurePolicyRuleAction) + log.Error(err, "retrieving information for FailurePolicyRuleAction") + return err + } + + err := applier(ctx, js, matchingFailedJob, updateStatusOps) + if err != nil { + log.Error(err, "error applying the FailurePolicyRuleAction: %v", failurePolicyRuleAction) + return err + } + + return nil +} + +// ruleIsApplicable returns true if the failed job and job failure reason match the failure policy rule. +// The function returns false otherwise. +func ruleIsApplicable(ctx context.Context, rule jobset.FailurePolicyRule, failedJob *batchv1.Job, jobFailureReason string) bool { + log := ctrl.LoggerFrom(ctx) + + ruleAppliesToJobFailureReason := len(rule.OnJobFailureReasons) == 0 || slices.Contains(rule.OnJobFailureReasons, jobFailureReason) + if !ruleAppliesToJobFailureReason { + return false + } + + parentReplicatedJob, exists := parentReplicatedJobName(failedJob) + if !exists { + // If we cannot find the parent ReplicatedJob, we assume the rule does not apply. + log.V(2).Info("The failed job %v does not appear to have a parent replicatedJob.", failedJob.Name) + return false + } + + ruleAppliesToParentReplicatedJob := len(rule.TargetReplicatedJobs) == 0 || slices.Contains(rule.TargetReplicatedJobs, parentReplicatedJob) + return ruleAppliesToParentReplicatedJob +} + // failurePolicyRecreateAll triggers a JobSet restart for the next reconcillation loop. func failurePolicyRecreateAll(ctx context.Context, js *jobset.JobSet, shouldCountTowardsMax bool, updateStatusOpts *statusUpdateOpts, event *eventParams) { log := ctrl.LoggerFrom(ctx) @@ -220,19 +174,6 @@ func failurePolicyRecreateAll(ctx context.Context, js *jobset.JobSet, shouldCoun log.V(2).Info("attempting restart", "restart attempt", js.Status.Restarts) } -// parentReplicatedJobName returns the name of the parent -// ReplicatedJob and true if it is able to retrieve the parent. -// The empty string and false are returned otherwise. -func parentReplicatedJobName(job *batchv1.Job) (string, bool) { - if job == nil { - return "", false - } - - replicatedJobName, ok := job.Labels[jobset.ReplicatedJobNameKey] - replicatedJobNameIsUnset := !ok || replicatedJobName == "" - return replicatedJobName, !replicatedJobNameIsUnset -} - // The type failurePolicyActionApplier applies a FailurePolicyAction and returns nil if the FailurePolicyAction was successfully applied. // The function returns an error otherwise. type failurePolicyActionApplier = func(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOpts *statusUpdateOpts) error @@ -288,22 +229,81 @@ var restartJobSetAndIgnoreMaxRestartsActionApplier failurePolicyActionApplier = return nil } -// applyFailurePolicyRuleAction applies the supplied FailurePolicyRuleAction. -func applyFailurePolicyRuleAction(ctx context.Context, js *jobset.JobSet, matchingFailedJob *batchv1.Job, updateStatusOps *statusUpdateOpts, failurePolicyRuleAction jobset.FailurePolicyAction) error { - log := ctrl.LoggerFrom(ctx) - - applier, ok := actionFunctionMap[failurePolicyRuleAction] - if !ok { - err := fmt.Errorf("failed to find a corresponding action for the FailurePolicyRuleAction %v", failurePolicyRuleAction) - log.Error(err, "retrieving information for FailurePolicyRuleAction") - return err +// parentReplicatedJobName returns the name of the parent +// ReplicatedJob and true if it is able to retrieve the parent. +// The empty string and false are returned otherwise. +func parentReplicatedJobName(job *batchv1.Job) (string, bool) { + if job == nil { + return "", false } - err := applier(ctx, js, matchingFailedJob, updateStatusOps) - if err != nil { - log.Error(err, "error applying the FailurePolicyRuleAction: %v", failurePolicyRuleAction) - return err + replicatedJobName, ok := job.Labels[jobset.ReplicatedJobNameKey] + replicatedJobNameIsUnset := !ok || replicatedJobName == "" + return replicatedJobName, !replicatedJobNameIsUnset +} + +// makeFailedConditionOpts returns the options we use to generate the JobSet failed condition. +func makeFailedConditionOpts(reason, msg string) *conditionOpts { + return &conditionOpts{ + condition: &metav1.Condition{ + Type: string(jobset.JobSetFailed), + Status: metav1.ConditionStatus(corev1.ConditionTrue), + Reason: reason, + Message: msg, + }, + eventType: corev1.EventTypeWarning, } +} + +// setJobSetFailedCondition sets a condition on the JobSet status indicating it has failed. +func setJobSetFailedCondition(ctx context.Context, js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) { + setCondition(js, makeFailedConditionOpts(reason, msg), updateStatusOpts) +} +// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job, +// if the JobFailed condition exists and is true. +func findJobFailureCondition(job *batchv1.Job) *batchv1.JobCondition { + if job == nil { + return nil + } + for _, c := range job.Status.Conditions { + if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { + return &c + } + } return nil } + +// findJobFailureTime is a helper function which extracts the Job failure time from a Job, +// if the JobFailed condition exists and is true. +func findJobFailureTime(job *batchv1.Job) *metav1.Time { + failureCondition := findJobFailureCondition(job) + if failureCondition == nil { + return nil + } + return &failureCondition.LastTransitionTime +} + +// findFirstFailedJob accepts a slice of failed Jobs and returns the Job which has a JobFailed condition +// with the oldest transition time. +func findFirstFailedJob(failedJobs []*batchv1.Job) *batchv1.Job { + var ( + firstFailedJob *batchv1.Job + firstFailureTime *metav1.Time + ) + for _, job := range failedJobs { + failureTime := findJobFailureTime(job) + // If job has actually failed and it is the first (or only) failure we've seen, + // store the job for output. + if failureTime != nil && (firstFailedJob == nil || failureTime.Before(firstFailureTime)) { + firstFailedJob = job + firstFailureTime = failureTime + } + } + return firstFailedJob +} + +// messageWithFirstFailedJob appends the first failed job to the original event message in human readable way. +func messageWithFirstFailedJob(msg, firstFailedJobName string) string { + return fmt.Sprintf("%s (first failed job: %s)", msg, firstFailedJobName) +} From 3fee011926ca71e43181535faa6d5ecc0793cb5d Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 20:00:11 +0000 Subject: [PATCH 22/25] Remove parentheses around '[]int'. --- pkg/webhooks/jobset_webhook.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/webhooks/jobset_webhook.go b/pkg/webhooks/jobset_webhook.go index 55a56d8c..db116dd2 100644 --- a/pkg/webhooks/jobset_webhook.go +++ b/pkg/webhooks/jobset_webhook.go @@ -165,7 +165,7 @@ func validateFailurePolicy(failurePolicy *jobset.FailurePolicy, validReplicatedJ } // ruleNameToRulesWithName is used to verify that rule names are unique - ruleNameToRulesWithName := make(map[string]([]int)) + ruleNameToRulesWithName := make(map[string][]int) for index, rule := range failurePolicy.Rules { // Check that the rule name meets the minimum length nameLen := len(rule.Name) From 92fe2f77eee55ec9d0da1c5b6cb55c21c896f1b0 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 22:36:38 +0000 Subject: [PATCH 23/25] Rename 'failurePolicyRule' to 'matchingFailurePolicyRule'. --- pkg/controllers/failure_policy.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index e0852401..f9ed0b22 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -57,14 +57,14 @@ func executeFailurePolicy(ctx context.Context, js *jobset.JobSet, ownedJobs *chi // Check for matching Failure Policy Rule rules := js.Spec.FailurePolicy.Rules - failurePolicyRule, matchingFailedJob := findFirstFailedPolicyRuleAndJob(ctx, rules, ownedJobs.failed) + matchingFailurePolicyRule, matchingFailedJob := findFirstFailedPolicyRuleAndJob(ctx, rules, ownedJobs.failed) var failurePolicyRuleAction jobset.FailurePolicyAction - if failurePolicyRule == nil { + if matchingFailurePolicyRule == nil { failurePolicyRuleAction = defaultFailurePolicyRuleAction matchingFailedJob = findFirstFailedJob(ownedJobs.failed) } else { - failurePolicyRuleAction = failurePolicyRule.Action + failurePolicyRuleAction = matchingFailurePolicyRule.Action } if err := applyFailurePolicyRuleAction(ctx, js, matchingFailedJob, updateStatusOpts, failurePolicyRuleAction); err != nil { From 6bd209fee4d41cbf49d3ab3eecdcf8aa1b82cde1 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 22:38:55 +0000 Subject: [PATCH 24/25] Combine function function call and error check into one line. --- pkg/controllers/failure_policy.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go index f9ed0b22..2c6369da 100644 --- a/pkg/controllers/failure_policy.go +++ b/pkg/controllers/failure_policy.go @@ -121,8 +121,7 @@ func applyFailurePolicyRuleAction(ctx context.Context, js *jobset.JobSet, matchi return err } - err := applier(ctx, js, matchingFailedJob, updateStatusOps) - if err != nil { + if err := applier(ctx, js, matchingFailedJob, updateStatusOps); err != nil { log.Error(err, "error applying the FailurePolicyRuleAction: %v", failurePolicyRuleAction) return err } From 6a1ac21326663d425e764d5b3811ede457b888e7 Mon Sep 17 00:00:00 2001 From: Justin Edwins Date: Mon, 20 May 2024 22:41:58 +0000 Subject: [PATCH 25/25] Change 'gomega.BeComparableTo' to 'gomega.Equal'. --- test/integration/controller/jobset_controller_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/controller/jobset_controller_test.go b/test/integration/controller/jobset_controller_test.go index 92eb8b6d..e6bfe80f 100644 --- a/test/integration/controller/jobset_controller_test.go +++ b/test/integration/controller/jobset_controller_test.go @@ -2053,7 +2053,7 @@ func matchJobSetRestartsCountTowardsMax(js *jobset.JobSet, expectedCount int32) } return newJs.Status.RestartsCountTowardsMax, nil - }, timeout, interval).Should(gomega.BeComparableTo(expectedCount)) + }, timeout, interval).Should(gomega.Equal(expectedCount)) } func matchJobSetReplicatedStatus(js *jobset.JobSet, expectedStatus []jobset.ReplicatedJobStatus) { @@ -2069,7 +2069,7 @@ func matchJobSetReplicatedStatus(js *jobset.JobSet, expectedStatus []jobset.Repl } sort.Slice(newJs.Status.ReplicatedJobsStatus, compareNames) return newJs.Status.ReplicatedJobsStatus, nil - }, timeout, interval).Should(gomega.BeComparableTo(expectedStatus)) + }, timeout, interval).Should(gomega.Equal(expectedStatus)) } // 2 replicated jobs: