From aa8af12764d53f82745da36827de6259656cbce2 Mon Sep 17 00:00:00 2001 From: Michal Wozniak Date: Thu, 23 Jun 2022 13:34:50 +0200 Subject: [PATCH] Add KEP for Retriable and non-retriable Pod failures for Jobs --- keps/prod-readiness/sig-apps/3329.yaml | 3 + .../README.md | 1542 +++++++++++++++++ .../kep.yaml | 46 + 3 files changed, 1591 insertions(+) create mode 100644 keps/prod-readiness/sig-apps/3329.yaml create mode 100644 keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md create mode 100644 keps/sig-apps/3329-retriable-and-non-retriable-failures/kep.yaml diff --git a/keps/prod-readiness/sig-apps/3329.yaml b/keps/prod-readiness/sig-apps/3329.yaml new file mode 100644 index 00000000000..0c043dd13f9 --- /dev/null +++ b/keps/prod-readiness/sig-apps/3329.yaml @@ -0,0 +1,3 @@ +kep-number: 3329 +alpha: + approver: "@johnbelamaric" diff --git a/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md b/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md new file mode 100644 index 00000000000..f92e5fe1eb2 --- /dev/null +++ b/keps/sig-apps/3329-retriable-and-non-retriable-failures/README.md @@ -0,0 +1,1542 @@ + +# KEP-3329: Retriable and non-retriable Pod failures for Jobs + + + + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Job-level vs. pod-level spec](#job-level-vs-pod-level-spec) + - [Relationship with Pod.spec.restartPolicy](#relationship-with-podspecrestartpolicy) + - [Current state review](#current-state-review) + - [Preemption](#preemption) + - [Taint-based eviction](#taint-based-eviction) + - [Node drain](#node-drain) + - [Node-pressure eviction](#node-pressure-eviction) + - [OOM kill](#oom-kill) + - [Disconnected node](#disconnected-node) + - [Direct container kill](#direct-container-kill) + - [Termination initiated by Kubelet](#termination-initiated-by-kubelet) + - [JobSpec API alternatives](#jobspec-api-alternatives) + - [Risks and Mitigations](#risks-and-mitigations) + - [Garbage collected pods](#garbage-collected-pods) + - [Evolving condition types](#evolving-condition-types) +- [Design Details](#design-details) + - [New PodConditions](#new-podconditions) + - [JobSpec API](#jobspec-api) + - [Evaluation](#evaluation) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Deprecation](#deprecation) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Upgrade](#upgrade) + - [Downgrade](#downgrade) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - 
[Only support for exit codes](#only-support-for-exit-codes) + - [Using Pod status.reason field](#using-pod-statusreason-field) + - [Using of various PodCondition types](#using-of-various-podcondition-types) + - [More nodeAffinity-like JobSpec API](#more-nodeaffinity-like-jobspec-api) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [x] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [x] (R) KEP approvers have approved the KEP status as `implementable` +- [x] (R) Design details are appropriately documented +- [x] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests for meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [x] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [x] (R) Production readiness review completed +- [x] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +This KEP extends Kubernetes to configure a job policy for handling pod failures. +In particular, the extension allows determining some of pod failures as caused +by infrastructure errors and to retry them without incrementing the counter +towards `backoffLimit`. + +Additionally, the extension allows determining some pod failures as caused by +software bugs and to terminate the associated job early. This is needed to save +time and computational resources wasted due to unnecessary retries of containers +destined to fail due to software bugs. + + + +## Motivation + +Running a large computational workload, comprising thousands of pods on +thousands of nodes requires usage of pod restart policies in order +to account for infrastructure failures. + +Currently, kubernetes Job API offers a way to account for infrastructure +failures by setting `.backoffLimit > 0`. However, this mechanism intructs the +job controller to restart all failed pods - regardless of the root cause +of the failures. Thus, in some scenarios this leads to unnecessary +restarts of many pods, resulting in a waste of time and computational +resources. What makes the restarts more expensive is the fact that the +failures may be encountered late in the execution time of a program. + +Sometimes it can be determined from containers exit codes +that the root cause of a failure is in the executable and the +job is destined to fail regardless of the number of retries. 
However, since +the large workloads are often scheduled to run over night or over the +weekend, there is no human assistance to terminate such a job early. + +The need for solving the problem has been emphasized by the kubernetes +community in the issues, see: [#17244](https://github.com/kubernetes/kubernetes/issues/17244) and [#31147](https://github.com/kubernetes/kubernetes/issues/31147). + +Some third-party frameworks have implemented retry policies for their pods: +- [TensorFlow Training (TFJob)](https://www.kubeflow.org/docs/components/training/tftraining/) +- [Argo workflow](https://github.com/argoproj/argo-workflows/blob/master/examples/retry-conditional.yaml) ([example](https://github.com/argoproj/argo-workflows/blob/master/examples/retry-conditional.yaml)) + +Additionally, some pod failures are not linked with the container execution, +but rather with the internal kubernetes cluster management (see: +[Scheduling, Preemption and Eviction](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/)). +Such pod failures should be recognized as infrastructure failures and it +should be possible to ignore them from the counter towards `backoffLimit`. + + + +### Goals + +- Extension of Job API with user-friendly syntax to terminate jobs based on the + end state of the failed pod. + + + +### Non-Goals + +- Implementation of other indicators of non-retriable jobs such as termination logs. +- Modification of the semantics for job termination. In particular, allowing for + all indexes of an indexed-job to execute when only one or a few indexes fail + [#109712](https://github.com/kubernetes/kubernetes/issues/109712). +- Similar termination policies for other workload controllers such as Deployments + or StatefulSets. +- Handling of Pod configuration errors resulting in pods stuck in the `Pending` + state (value of `status.phase`) rather than `Failed` (such as incorrect image + name, non-matching configMap references, incorrect PVC references). + + + +## Proposal + +Extension of the Job API with a new field which allows to configure the set of +conditions and associated actions which determine how a pod failure is handled. +The extended Job API supports discrimination of pod failures based on the +container exit codes as well as based on the end state of a failed pod. + +In order to support discrimination of pod failures based on their end state +we use the already existing `status.conditions` field to append a dedicated Pod +Condition indicating (by its type) that the pod is being terminated by an +internal kubernetes component. Moreover, we modify the internal kubernetes +components to send an API call to append the dedicated Pod condition along with +sending the associated Pod delete request. In particluar, the following +kubernetes components will be modified: +- kube-controller-manager (taint manager performing pod eviction) +- kube-scheduler (when performing `Preemption`) + +We use the job controller's main loop to detect and categorize the pod failures +with respect to the configuration. For each failed pod, one of the following +actions is applied: +- terminate the job (non-retriable failure), +- ignore the failure (retriable failure) - restart the pod and do not increment + the counter for `backoffLimit`, +- increment the `backoffLimit` counter and restart the pod if the limit is not + reached (current behaviour). 
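+
+The sketch below illustrates how one of the modified components could append
+the dedicated Pod condition with a status update call issued before the Pod
+delete request. It is a minimal, illustrative sketch only: the
+`PreemptedByScheduler` condition type, the helper name and the `Reason` and
+`Message` strings are placeholders, and the actual changes will be made
+directly in kube-scheduler and kube-controller-manager.
+
+```golang
+package example
+
+import (
+	"context"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// markAndDeletePod appends the dedicated condition to the Pod status and then
+// deletes the Pod, so that the Job controller can observe the condition when
+// handling the failed Pod.
+func markAndDeletePod(ctx context.Context, c kubernetes.Interface, pod *v1.Pod) error {
+	pod.Status.Conditions = append(pod.Status.Conditions, v1.PodCondition{
+		Type:               v1.PodConditionType("PreemptedByScheduler"), // proposed condition type
+		Status:             v1.ConditionTrue,
+		LastTransitionTime: metav1.Now(),
+		Reason:             "PreemptedByScheduler",
+		Message:            "Pod preempted to admit a higher priority pod",
+	})
+	// Persist the condition first, so it is already visible on the failed Pod.
+	if _, err := c.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
+		return err
+	}
+	// Then issue the delete request as today.
+	return c.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+}
+```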
+ + + +### User Stories (Optional) + + + +#### Story 1 + +As a machine learning researcher, I run jobs comprising thousands +of long-running pods on a cluster comprising thousands of nodes. The jobs often +run at night or over weekend without any human monitoring. In order to account +for random infrastructure failures we define `.backoffLimit: 6` for the job. +However, a signifficant portion of the failures happen due to bugs in code. +Moreover, the failures may happen late during the program execution time. In +such case, restarting such a pod results in wasting a lot of computational time. + +We would like to be able to automatically detect and terminate the jobs which are +failing due to the bugs in code of the executable, so that the computation resources +can be saved. + +Occasionally, our executable fails, but it can be safely restarted with a good +chance of succeeding the next time. In such known retriable situations our +executable exits with a dedicated exit code in the 40-42 range. All remaining +exit codes indicate a software bug and should result in an early job termination. + +The following Job configuration could be a good starting point to satisfy +my needs: + +```yaml +apiVersion: v1 +kind: Job +spec: + template: + spec: + containers: + - name: job-container + image: job-image + command: ["./program"] + backoffLimit: 6 + backoffPolicy: + rules: + - action: Terminate + onExitCode: + operator: NotIn + values: [40,41,42] +``` + +Note that, when no rule specified in `backoffPolicy` matches the pod failure +the default handling of pod failures applies - the counter of pod failures +is incremented and checked against the `backoffLimit` +(see: [JobSpec API](#jobspec-api)]). + +#### Story 2 + +As a service provider that offers computational resources to researchers I would like to +have a mechanism which terminates jobs for which pods are failing due to user errors, +but allows infinite retries for pod failures caused by cluster-management +events (such as preemption). I do not have knowledge or influence over the executable that researchers run, +so I don't know beforehand which exit codes they might return. + +The following Job configuration could be a good starting point to satisfy +my needs: + +```yaml +apiVersion: v1 +kind: Job +spec: + template: + spec: + containers: + - name: main-job-container + image: job-image + command: ["./program"] + - name: monitoring-job-container + image: job-monitoring + command: ["./monitoring"] + backoffLimit: 3 + backoffPolicy: + rules: + - action: Ignore + onPodConditions: + operator: In + values: [ PreemptedByScheduler, EvictedByTaints ] +``` + +Note that, in this case the user supplies a list of Pod condition type values. +This approach is likely to require an iterative process to review and extend of +the list. + +### Notes/Constraints/Caveats (Optional) + +#### Job-level vs. pod-level spec + +We considered introduction of this feature in the pod spec, allowing to account for +container restarts within a running pod. However, we consider handling of +job-level failures or termination (for example, a Preemption) as an integral part of +this proposal. Also, when pod's `spec.restartPolicy` is specified as `Never`, then the +failures can't be handled by kubelet and need to be handled at job-level anyway. + +Also, we consider this proposal as a natural extension of the already exiting +mechanism for job-level restarts of failed pods based on the Job's +`spec.backoffLimit` configuration. 
In particular, this proposal aims to fix the +issue, in this mechanism, of unnecessary restarts when a Job can be determined +to fail despite retries. + +We believe this feature can co-exist with other pod-level features providing +container restart policies. However, if we establish there is a problematic +interaction of this feature with another such feature, then we will consider +additional validation of the JobSpec configuration to avoid the situation. + +If, in the future, we introduce failure handling within the Pod spec, it would +be limited to restartPolicy=OnFailure. Only one of the Pod spec or Job spec APIs +will be allowed to be used at a time. + +#### Relationship with Pod.spec.restartPolicy + +For Alpha we may limit this feature by disallowing the use of `onExitCodes` when +`restartPolicy=OnFailure`. This is in order to avoid the problematic +race-conditions between Kubelet and Job controller. For example, Kubelet +could restart a failed container before the Job controller decides to terminate +the corresponding job due to a rule using `onExitCodes`. On the other hand, +the unnecessary container restart may not be too much of an issue. We are going +to re-evaluate if we want to support `onExitCodes` combined with +`restartPolicy=OnFailure` in Beta. + +#### Current state review + +Here we review the current state of kubernetes (version 1.24) regarding its +handling pod failures. + +The list below contains scenarios which we have reproduced in order to +investigate which pod fields could be used as indicators if a pod failure +should or should not be retried. + +The results demonstrate that there is no universal indicator (like a +pod or container field) currently that discriminates pod failures which should +be retried from those which should not be retried. + +##### Preemption + +- Reproduction: We run two long-running jobs. The second has higher priority + pod which preempts the lower priority pod +- Comments: controlled by kube-scheduler in `scheduler/framework/preemption/preemption.go` +- Pod status: + - status: Terminating + - `phase=Failed` + - `reason=` + - `message=` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=Error` +- Retriable: Yes + +##### Taint-based eviction + +- Reproduction: We run a long-running job. Then, we taint the node with `NoExecute` +- Comments: controlled by kube-scheduler in `controller/nodelifecycle/scheduler/taint_manager.go` +- Pod status: + - status: Terminating + - `phase=Failed` + - `reason=` + - `message=` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=Error` +- Retriable: Yes + +##### Node drain + +- Reproduction: We run a job with a long-running pod, then drain the node + with the `kubectl drain` command +- Comments: performed by Eviction API, controlled by kube-apiserver in `registry/core/pod/storage/eviction.go` +- Pod status: + - status: Terminating + - `phase=Failed` + - `reason=` + - `message=` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=Error` +- Retriable: Yes + +##### Node-pressure eviction + +Memory-pressure eviction: + +- Reproduction: We run a job with a pod which attempts to allocate more + memory than available on the node +- Comments: controlled by kubelet in `kubelet/eviction/eviction_manager.go` +- Pod status: + - status: ContainerStatusUnknown + - `phase=Failed` + - `reason=Evicted` + - `message=The node was low on resource: memory. 
(...)` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=ContainerStatusUnknown` +- Retriable: Unclear, excessive memory usage suggests a bug or misconfiguration. + However, a restart on another node may succeed + +Disk-pressure eviction: + +- Reproduction: We run a job with a pod which attempts to write more + data than the disk space available on the node +- Comments: controlled by kubelet in `kubelet/eviction/eviction_manager.go` +- Pod status: + - status: Error + - `phase=Failed` + - `reason=Evicted` + - `message=The node was low on resource: ephemeral-storage. (...)` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=Error` +- Retriable: Unclear, excessive disk usage suggests a bug or misconfiguration. + However, a restart on another node may succeed + +##### OOM kill + +- Reproduction: We run a job with a pod which attempts to allocate more + memory than constrained in the container spec by `resources.limits.memory` +- Comments: handled by kubelet +- Pod status: + - status: OOMKilled + - `phase=Failed` + - `reason=` + - `message=` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=OOMKilled` +- Retriable: Unclear, but if occurs when + `resources.requests.memory=resources.limits.memory` it strongly suggests + a software bug. + +##### Disconnected node + +- Reproduction: We run a job with a long-running pod, then disconnect the node + and delete it by the `kubectl delete` command +- Comments: handled by Pod Garbage collector in: `controller/podgc/gc_controller.go`. + However, the pod phase remains `Running`. +- Pod status: + - status: Terminating + - `phase=Running` + - `reason=` + - `message=` +- Container status: + - `state=Running` + - `exitCode=` + - `reason=` +- Retriable: Yes + +##### Direct container kill + +- Reproduction: We run a job with a long-running pod, then we kill the container +by the `crictl stop` command +- Comments: handled by Kubelet +- Pod status: + - status: Error + - `phase=Failed` + - `reason=` + - `message=` +- Container status: + - `state=Ternminated` + - `exitCode=137` + - `reason=Error` +- Retriable: Yes + +#### Termination initiated by Kubelet + +For Alpha, we limit this feature by not recognizing Pod failures initiated +by Kubelet. This is because it is hard to determine in some scenarios of +Pod failures initiated by Kubelet if they should be retried or should not. +For example, when Kubelet evicts a pod due to node pressure it might mean either +software bug (then it might be better to terminate the entire job) or the node +being low on memory due to other processes (in which case it is sensible to retry). +We are going to re-evaluate for Beta if we want to add support for recognizing +pod terminations initiated by Kubelet. + +#### JobSpec API alternatives + +Alternative versions of the JobSpec API to define requirements on exit codes and +on pod end state have been proposed and discussed (see: [Alternatives](#alternatives)). +The outcome of the discussions as well as the experience gained during the Alpha +implementation may influence the final API. + + + +### Risks and Mitigations + +#### Garbage collected pods + +The Pod status (which includes the `conditions` field and the container exit +codes) could be lost if the failed pod is garbage collected. + +Losing Pod's status before it is interpreted by Job Controller can be prevented +by using the feature of [job tracking with finalizers](https://kubernetes.io/docs/concepts/overview/working-with-objects/finalizers/). 
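+
+With that feature enabled, Pods created by the Job controller carry a
+finalizer which keeps the Pod object, including its conditions and container
+exit codes, available until the Job controller has accounted for the failure.
+For reference, such a Pod looks roughly as follows (the Pod name is
+illustrative):
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: example-job-abc12
+  finalizers:
+    - batch.kubernetes.io/job-tracking
+```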
+ +#### Evolving condition types + +The list of available PodCondition types field will be evolving with new +values being added and potentially some values becoming obsolete. This can make +it difficult to maintain a valid list of PodCondition types enumerated in +the Job configuration. + +In order to mitigate this risk we are going to define (along with +documentation) the new condition types as constants in the already existing list +defined in the k8s.io/apis/core/v1 package. Thus, every addition of a new +condition type will require an API review. The constants will allow users of the +package to reduce the risk of typing mistakes. + +Additionally, for Beta, we will re-evaluate an idea of a generic opinionated condition +type indicating that a pod can be retried, for example `TerminatedByControlPlane`. + +Finally, we are going to cover the handling of pod failures associated with the +new PodCondition types in integration tests. + + + +## Design Details + +As our [review](#current-state-review) shows there is currently no convenient +indicator, in the pod end state, if the pod should be retried or should not. +Thus, we introduce a set of dedicated Pod conditions which can be used for this +reason. + +### New PodConditions + +The following condition types are introduced to account for different +reasons for pod termination (we focus on covering these scenarios were the new +condition makes it easier to determine if a failed pod should be restarted): +- PreemptedByScheduler (Pod preempted by kube-scheduler) +- EvictedByTaints (Pod evicted by kube-controller-manager due to taints) +- GarbageCollected (an orphaned Pod terminated by GC) +- EvictedByAPI (Pod evicted by Eviction API) +- EvictedFromUnhealthyNode (initiated by the node lifecycle controller) + +The already existing `status.conditions` field in Pod will be used by kubernetes +control plane components (kube-scheduler and kube-controller-manager) to append +a dedicated condition when they send the delete operation. + +The API call to append the condition will be issued as a pod status update call +before the Pod delete request. This way the Job controller will already see the +condition when handling a failed pod. + +During the implementation process we are going to review the places where the +pod delete requests are issued to modify the code to also append a meaningful +condition with dedicated `Type`, `Reason` and `Message` fields based on the +invocation context. + +### JobSpec API + +We extend the Job API in order to allow to apply different actions depending +on the conditions associated with the pod failure. + +```golang +// BackoffPolicyAction specifies how a Pod failure is handled. +type BackoffPolicyAction string + +const ( + + // This is an action which might be taken on pod failure - mark the + // pod's job as Failed and terminate all running pods. + BackoffActionTerminate BackoffPolicyAction = "Terminate" + + // This is an action which might be taken on pod failure - the pod will be + // restarted and the counter for .backoffLimit will not be incremented. + BackoffActionIgnore BackoffPolicyAction = "Ignore" +) + +type BackoffPolicyOnExitCodesOperator string + +const ( + BackoffPolicyOnExitCodesOpIn BackoffPolicyOnExitCodesOperator = "In" + BackoffPolicyOnExitCodesOpNotIn BackoffPolicyOnExitCodesOperator = "NotIn" +) + +type BackoffPolicyOnExitCodesRequirement struct { + // Restricts the check for exit codes to only apply to the container with + // the specified name. 
When empty the rule applies to all containers + // +optional + ContainerName *string + + // Represents the relationship between the container exit code(s) and the + // specified values. + Operator BackoffPolicyOnExitCodesOperator + + // Specifies the set of values. Each returned container exit code (might be + // multiple in case of multiple containers) is checked against this set of + // values with respect to the operator + Values []int +} + +type BackoffPolicyOnPodConditionsOperator string + +const ( + BackoffPolicyOnPodConditionsOpIn BackoffPolicyOnPodConditionsOperator = "In" +) + +type BackoffPolicyOnPodConditionsRequirement struct { + + // Represents the relationship between the actual Pod condition types + // and the set of specified Pod condition types + Operator BackoffPolicyOnPodConditionsOperator + + // Specifies the set of values. Each actual pod condition type, with status=True, + // is checked against this set of values with respect to the operator + Values []api.PodConditionType +} + +type BackoffPolicyRule struct { + // Specifies the action taken on a pod failure when the requirements are satisfied. + Action BackoffPolicyAction + + // Represents the requirement on the container exit code + // +optional + OnExitCodes *BackoffPolicyOnExitCodesRequirement + + // Represents the requirement on the pod failure conditions + // +optional + OnPodConditions *BackoffPolicyOnPodConditionsRequirement +} + +// BackoffPolicy describes how failed pods influence the backoffLimit. +type BackoffPolicy struct { + // A list of backoff policy rules. The rules are evaluated in order. + // Once a rule matches a Pod failure, the remaining of the rules are ignored. + // When no rule matches the Pod failure, the default handling applies - the + // counter of pod failures is incremented and it is checked against + // the backoffLimit + Rules []BackoffPolicyRule +} + +// JobSpec describes how the job execution will look like. +type JobSpec struct { + ... + // Specifies the policy of handling failed pods. In particular, it allows to + // specify the set of actions and conditions which need to be + // satisfied to take the associated action. + // If empty, the default behaviour applies - the counter of pod failed is + // incremented and it is checked against the backoffLimit + // +optional + BackoffPolicy *BackoffPolicy + ... +``` + +Note that, we do not introduce the `NotIn` operator in +`BackoffPolicyOnPodConditionsOperator` as its usage could make job +configurations error-prone and hard to maintain. + +Additionally, we validate the following constraints for each instance of +BackoffPolicyRule: +- exactly one of the fields `onExitCodes` and `OnPodConditions` is specified + for a requirement +- the specified `containerName` matches name of a configurated container + +Here is an example Job configuration which uses this API: + +```yaml +apiVersion: v1 +kind: Job +spec: + template: + spec: + containers: + - name: main-job-container + image: job-image + command: ["./program"] + - name: monitoring-job-container + image: job-monitoring + command: ["./monitoring"] + backoffLimit: 3 + backoffPolicy: + rules: + - action: Terminate + onExitCodes: + containerName: main-job-container + operator: In + values: [1,2,3] + - action: Ignore + onPodConditions: + operator: In + values: [ PreemptedByScheduler ] +``` + +### Evaluation + +We use the `syncJob` function of the Job controller to evaluate the specified +`backoffPolicy` rules against the failed pods. 
It is only the first rule with +matching requirements which is applied as the rules are evaluated in order. If +the pod failure does not match any of the specified rules, then default +handling of failed pods applies. + +If we limit this feature to use `onExitCodes` only when `restartPolicy=Never` +(see: [limitting this feature](#limitting-this-feature)), then the rules using +`onExitCodes` are evaluated only against the exit codes in the `state` field +(under `terminated.exitCode`) of `pod.status.containerStatuses` and +`pod.status.initContainerStatuses`. We may also need to check for the exit codes +in `lastTerminatedState` if we decide to support `onExitCodes` when +`restartPolicy=OnFailure`. + + + +### Test Plan + + + +[x] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes necessary +to implement this enhancement. + +##### Prerequisite testing updates + + + +We assess that the Job controller (which is where the most complicated changes +will be done) has adequate test coverage for places which might be impacted by +this enhancement. Thus, no additional tests prior implementing this enhancement +are needed. + +##### Unit tests + + + +Unit tests will be added along with any new code introduced. In particular, +the following scenarios will be covered with unit tests: +- handling or ignoring of `spec.backoffPolicy` by the Job controller when the + feature gate is enabled or disabled, respectively, +- validation of a job configuration with respect to `spec.backoffPolicy` by + kube-apiserver +- handling of a pod failure, in accordance with the specified `spec.backoffPolicy`, + when the failure is associated with + - a failed container with non-zero exit code, + - a dedicated Pod condition indicating termmination originated by a kubernetes component + + +The core packages (with their unit test coverage) which are going to be modified during the implementation: +- `k8s.io/kubernetes/pkg/controller/job`: `13 June 2022` - `88%` +- `k8s.io/kubernetes/pkg/apis/batch/validation`: `13 June 2022` - `94.4%` +- `k8s.io/kubernetes/pkg/apis/batch/v1`: `13 June 2022` - `83.6%` + +##### Integration tests + +The following scenarios will be covered with integration tests: +- enabling, disabling and re-enabling of the feature gate +- pod failure is triggered by a delete API request along with appending a + Pod condition indicating termination originated by a kubernetes component + (we aim to cover all such scenarios) +- pod failure is caused by a failed container with a non-zero exit code + +More integration tests might be added to ensure good code coverage based on the +actual implemention. + + + +##### e2e tests + + +The following scenario will be covered with e2e tests: +- early job termination when a container fails with a non-retriable exit code + +More e2e test scenarios might be considered during implementation if practical. 
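+
+To make the semantics exercised by the unit and integration tests above more
+concrete, below is a simplified, illustrative sketch of the first-match rule
+evaluation described in [Evaluation](#evaluation). The types are stripped-down
+stand-ins for the proposed API and this is not the actual Job controller code:
+
+```golang
+package example
+
+// Action is a stand-in for the proposed BackoffPolicyAction.
+type Action string
+
+const (
+	ActionTerminate Action = "Terminate"
+	ActionIgnore    Action = "Ignore"
+)
+
+// OnExitCodes is a stand-in for BackoffPolicyOnExitCodesRequirement.
+type OnExitCodes struct {
+	Operator string // "In" or "NotIn"
+	Values   []int
+}
+
+// Rule is a stand-in for BackoffPolicyRule.
+type Rule struct {
+	Action          Action
+	OnExitCodes     *OnExitCodes
+	OnPodConditions []string // condition types required to have status=True
+}
+
+func exitCodeMatches(req *OnExitCodes, exitCode int) bool {
+	found := false
+	for _, v := range req.Values {
+		if v == exitCode {
+			found = true
+			break
+		}
+	}
+	if req.Operator == "NotIn" {
+		return !found
+	}
+	return found
+}
+
+// MatchAction returns the action of the first rule matching the failed Pod,
+// or nil when no rule matches, in which case the default handling applies:
+// the counter of pod failures is incremented and checked against backoffLimit.
+func MatchAction(rules []Rule, exitCode int, trueConditions map[string]bool) *Action {
+	for i := range rules {
+		rule := rules[i]
+		if rule.OnExitCodes != nil && exitCodeMatches(rule.OnExitCodes, exitCode) {
+			return &rule.Action
+		}
+		for _, condType := range rule.OnPodConditions {
+			if trueConditions[condType] {
+				return &rule.Action
+			}
+		}
+	}
+	return nil
+}
+```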
+ +### Graduation Criteria + + +#### Alpha + +- Implementation: + - handling of failed pods with respect to `spec.backoffPolicy` by Job controller + - appending of a dedicated Pod condition (when the Pod termination is + initiated by a kubernetes control plane component) to the list of Pod + conditions along with sending the Pod delete request + - define as a constant and document the new Pod condition Type + - the feature is limited by disallowing of the use of `onExitCodes` when + `restartPolicy=OnFailure` +- The feature flag disabled by default +- Tests: unit and integration + +#### Beta + +- Address reviews and bug reports from Alpha users +- E2e tests are in Testgrid and linked in KEP +- A scalability test to demonstrate the limited impact of the additional API call + when terminating a Pod +- Re-evaluate modification to kubelet to send a dedicated condition when + terminating a Pod, based on user feedback +- Re-evaluate supporting of `onExitCodes` when `restartPolicy=OnFailure` +- Re-evaluate introduction of a generic opinionated condition type + indicating that a pod should be retried +- The feature flag enabled by default + +#### GA + +- Address reviews and bug reports from Beta users +- The feature is unconditionally enabled + + + +#### Deprecation + +N/A + +### Upgrade / Downgrade Strategy + +#### Upgrade + +An upgrade to a version which supports this feature should not require any +additional configuration changes. In order to use this feature after an upgrade +users will need to configure their Jobs by specifying `spec.backoffPolicy`. The +only noticeable difference in behaviour, without specifying `spec.backoffPolicy`, +is that Pods terminated by kubernetes components will have an additional +condition appended to `status.conditions`. + +#### Downgrade + +A downgrade to a version which does not support this feature should not require +any additional configuration changes. Jobs which specified +`spec.backoffPolicy` (to make use of this feature) will be handled in a +default way. + + + +### Version Skew Strategy + +This feature uses an additional API call between kubernetes components to +append a Pod condition when terminating a pod. However, this API call uses +pre-existing API so the version skew does not introduce runtime compatibility +issues. + +We use the feature gate strategy for coordination of the feature enablement +between components. + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [x] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: JobBackoffPolicy + - Components depending on the feature gate: + - kube-apiserver + - kube-controller-manager + - kube-scheduler +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + +###### Does enabling the feature change any default behavior? + +Yes. The kubernetes components (kube-scheduler and kube-controller-manager) will +append a Pod Condition along with the request pod delete request. + +However, the part of the feature responsible for handling of the failed pods +is opt-in with `.spec.backoffPolicy`. + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + +Yes. Using the feature gate is the recommended way. 
When the feature is disabled +the Job controller manager handles pod failures in the default way even if +`spec.backoffPolicy` is specified. Additionally, the dedicated Pod Conditions +are no longer appended along with delete requests. + + + +###### What happens if we reenable the feature if it was previously rolled back? + +The Job controller starts to handle pod failures according to the specified +`spec.backoffPolicy`. Additionally, again, along with the delete requests, the +dedicated Pod Conditions are appended to Pod's `status.condition`. + +###### Are there any tests for feature enablement/disablement? + +Yes, unit and integration test for the feature enabled, disabled and transitions. + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + +We use the metrics-based approach based on the following metrics (exposed by +kube-controller-manager): + - `job_finished_total` (existing, extended by a label): the new `reason` +label indicates the reason for the job termination. Possible values are +`BackoffPolicyRule`, `BackoffLimitExceeded` and`DeadlineExceeded`. +It can be used to determine what is the relative frequency of job terminations +due to different reasons. For example, if jobs are terminated often due to +`BackoffLimitExceeded` it may suggest that the backoff policy should be extended +with new rules to terminate jobs early more often + - `job_pod_failure_total` (new): tracks the handling of failed pods. It will +have the `action` label indicating how a pod failure was handled. Possible +values are:`JobTerminated`, `Ignored` and `Default`. This metric can be used to +assess the coverage of pod failure scenarios with `spec.backoffPolicy` rules. + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [x] Metrics + - Metric name: + - `job_sync_duration_seconds` (existing): can be used to see how much the +feature enablement increases the time spent in the sync job + - Components exposing the metric: kube-controller-manager + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + +Yes. An API call to append a Pod condition when deleting the Pod. + + + +###### Will enabling / using this feature result in introducing new API types? + +No. + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + +No. + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? 
+ +No. + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + +No. + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + +The additional CPU and memory increase in kube-controller-manager related to +handling of failed pods is negligible and only limited to these jobs which +specify `spec.backoffPolicy`. + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + +### Only support for exit codes + +We considered supporting just exit codes when defining the policy for handling +pod failures. However, this approach alone would not be sufficient to +distinguish pod failures caused by infrastructure issues. A special handling +of such failures is important in some use cases (see: [Story 2](#story-2)). + +### Using Pod status.reason field + +We considered using of the pod's `status.reason` field to determine +the reason for a pod failure. This field would be set based on the DeleteOptions +reason field associated with the delete API requests. However, this approach is +problematic as then the field would be used to set by multiple components +leading to race-conditions. Also reasons could be arbitrary strings, making it +hard for users to know which reasons to look for in each version. + +### Using of various PodCondition types + +We considered introducing a set of dedicated PodCondition types +corresponding to different components or reasons in which a pod deletion +is triggered. However, this could be problematic as the list of available +PodCondition types field would be evolving with new values being added and +potentially some values becoming obsolete. This could make it difficult to +maintain a valid list of PodCondition types enumerated in the Job configuration. + +### More nodeAffinity-like JobSpec API + +Along with introduction of the set of PodCondition types we also considered +a more nodeAffinity-like JobSpec API being able to match against multiple +condition types. It would also support the `key` field for constraining the +`status` field (and potentially other fields). This is an example Job +spec using such API: + +```yaml + backoffPolicy: + rules: + - action: Ignore + - onPodConditions: + - key: Type + operator: In + values: + - Evicted + - Preempted + - key: Status + operator: In + values: + - True +``` + +Such API, while more flexible, might be harder to use in practice. Thus, in the +first iteration of the feature, we intend to provide a user-friendly API +targeting the known use-cases. A more flexible API can be considered as a future +improvement. 
+ + + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-apps/3329-retriable-and-non-retriable-failures/kep.yaml b/keps/sig-apps/3329-retriable-and-non-retriable-failures/kep.yaml new file mode 100644 index 00000000000..6ac29d42656 --- /dev/null +++ b/keps/sig-apps/3329-retriable-and-non-retriable-failures/kep.yaml @@ -0,0 +1,46 @@ +title: Retriable and non-retriable Pod failures for Jobs +kep-number: 3329 +authors: + - "@mimowo" +owning-sig: sig-apps +participating-sigs: + - sig-scheduling +status: implementable +creation-date: 2022-06-07 +reviewers: + - "@liggitt" + - "@bobbypage" +approvers: + - "@soltysh" + - "@alculquicondor" + - "@deads2k" + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.25" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.25" + beta: "v1.26" + stable: "v1.27" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: JobBackoffPolicy + components: + - kube-apiserver + - kube-controller-manager + - kube-scheduler +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - job_sync_duration_seconds + - job_finished_total + - job_pod_failure_total