Skip to content

Commit

Permalink
Implement configurable failure policy.
Browse files Browse the repository at this point in the history
  • Loading branch information
Justin Edwins committed Apr 26, 2024
1 parent 3ae60bb commit 72b6e44
Show file tree
Hide file tree
Showing 31 changed files with 1,731 additions and 53 deletions.
54 changes: 50 additions & 4 deletions api/jobset/v1alpha2/jobset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ import (
const (
JobSetNameKey string = "jobset.sigs.k8s.io/jobset-name"
ReplicatedJobReplicas string = "jobset.sigs.k8s.io/replicatedjob-replicas"
ReplicatedJobNameKey string = "jobset.sigs.k8s.io/replicatedjob-name"
JobIndexKey string = "jobset.sigs.k8s.io/job-index"
JobKey string = "jobset.sigs.k8s.io/job-key"
JobNameKey string = "job-name" // TODO(#26): Migrate to the fully qualified label name.
// ReplicatedJobNameKey is used to index into a Job's labels and retrieve the name of the parent ReplicatedJob.
ReplicatedJobNameKey string = "jobset.sigs.k8s.io/replicatedjob-name"
JobIndexKey string = "jobset.sigs.k8s.io/job-index"
JobKey string = "jobset.sigs.k8s.io/job-key"
JobNameKey string = "job-name" // TODO(#26): Migrate to the fully qualified label name.
// ExclusiveKey is an annotation that can be set on the JobSet or on a ReplicatedJob template.
// If set at the JobSet level, all child jobs from all ReplicatedJobs will be scheduled using exclusive
// job placement per topology group (defined as the label value).
Expand Down Expand Up @@ -119,6 +120,9 @@ type JobSetStatus struct {
// Restarts tracks the number of times the JobSet has restarted (i.e. recreated in case of RecreateAll policy).
Restarts int32 `json:"restarts,omitempty"`

// RestartsCountTowardsMax tracks the number of JobSet restarts that count towards the maximum allowed number of restarts.
RestartsCountTowardsMax int32 `json:"restartsCountTowardsMax,omitempty"`

// ReplicatedJobsStatus track the number of JobsReady for each replicatedJob.
// +optional
// +listType=map
Expand Down Expand Up @@ -218,10 +222,52 @@ const (
OperatorAny Operator = "Any"
)

// FailurePolicyAction defines the action the JobSet controller will take for
// a given FailurePolicyRule.
type FailurePolicyAction string

const (
// FailJobSet fails the JobSet immediately, regardless of maxRestarts.
FailJobSet FailurePolicyAction = "FailJobSet"

// RestartJobSet restarts the JobSet if the number of restart attempts is
// less than MaxRestarts. Otherwise, it fails the JobSet.
RestartJobSet FailurePolicyAction = "RestartJobSet"

// RestartJobSetAndIgnoreMaxRestarts restarts the JobSet and does not count
// the failure against maxRestarts.
RestartJobSetAndIgnoreMaxRestarts FailurePolicyAction = "RestartJobSetAndIgnoreMaxRestarts"
)

// FailurePolicyRule defines a FailurePolicyAction to be executed if a child job
// fails due to a reason listed in OnJobFailureReasons.
type FailurePolicyRule struct {
// Action is the action to take if the rule is matched.
// +kubebuilder:validation:Enum:=FailJobSet;RestartJobSet;RestartJobSetAndIgnoreMaxRestarts
Action FailurePolicyAction `json:"action"`
// OnJobFailureReasons is the requirement on the job failure reasons.
// The requirement is satisfied if at least one reason matches the list.
// The rules are evaluated in order, and the first matching
// rule is executed.
// An empty list applies the rule to any job failure reason.
// +kubebuilder:validation:UniqueItems:true
OnJobFailureReasons []string `json:"onJobFailureReasons"`
// TargetReplicatedJobs are the names of the replicated jobs this rule applies to.
// An empty list will apply to all replicatedJobs.
// +optional
// +listType=atomic
TargetReplicatedJobs []string `json:"targetReplicatedJobs,omitempty"`
}

// FailurePolicy holds the restart limit and the ordered list of failure
// policy rules for a JobSet.
type FailurePolicy struct {
// MaxRestarts defines the limit on the number of JobSet restarts.
// A restart is achieved by recreating all active child jobs.
MaxRestarts int32 `json:"maxRestarts,omitempty"`

// Rules is the list of failure policy rules for this JobSet.
// For a given Job failure, the rules will be evaluated in order,
// and only the first matching rule will be executed.
// If no matching rule is found, the RestartJobSet action is applied.
Rules []FailurePolicyRule `json:"rules,omitempty"`
}

type SuccessPolicy struct {
Expand Down
81 changes: 81 additions & 0 deletions api/jobset/v1alpha2/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 33 additions & 1 deletion api/jobset/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 15 additions & 1 deletion client-go/applyconfiguration/jobset/v1alpha2/failurepolicy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

61 changes: 61 additions & 0 deletions client-go/applyconfiguration/jobset/v1alpha2/failurepolicyrule.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions client-go/applyconfiguration/jobset/v1alpha2/jobsetstatus.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions client-go/applyconfiguration/utils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 72b6e44

Please sign in to comment.