Skip to content

Commit

Permalink
Only identify specific exit codes as retryable error (kubeflow#518)
Browse files Browse the repository at this point in the history
* Should return ReplicaStateFailed if container exit code is not 0

* Update the criteria for retryable errors.

* Reformat

* Reformat

* Reformat

* Fix lint error.

* Handle the exit code more explicitly.

* Reformat.

* Create a util func for IsRetryableExitCode.
  • Loading branch information
wenzhel101 authored and Penghui Yan committed Jun 18, 2018
1 parent b3d18a1 commit 2386e11
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 27 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,19 @@ The semantics are as follows
* The overall status of the TFJob is determined by the exit code of the
tensorflow container
* 0 = success
* 1-127 = permanent error
* 128-255 = retryable error
* 1 || 2 || 126 || 127 || 128 || 139 = permanent errors:
* 1: general errors
* 2: misuse of shell builtins
* 126: command invoked cannot execute
* 127: command not found
* 128: invalid argument to exit
* 139: contianer terminated by SIGSEGV(Invalid memory reference)
* 130 || 137 || 143 = retryable error for unexpected system signals:
* 130: contianer terminated by Control-C
* 137: container received a SIGKILL
* 143: container received a SIGTERM
* 138 = reserved in tf-operator for user specified retryable errors
* others = undefined and no guarantee

**worker**
* A job can have 0 to N workers
Expand Down
1 change: 1 addition & 0 deletions pkg/trainer/replicas.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ func replicaStatusFromPodList(l v1.PodList, name string) tfv1alpha1.ReplicaState
return tfv1alpha1.ReplicaStateRunning
}

// TODO(wenzhel): Should consider to return more info about pod to users.
return tfv1alpha1.ReplicaStateFailed
}

Expand Down
27 changes: 3 additions & 24 deletions pkg/trainer/training.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned"
"github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme"
"github.com/kubeflow/tf-operator/pkg/util"
train_util "github.com/kubeflow/tf-operator/pkg/util/train"
)

// TODO(jlewi): We should switch a New pattern and make trainingJob private so we can
Expand Down Expand Up @@ -190,8 +191,6 @@ func (j *TrainingJob) GetStatus() (tfv1alpha1.State, []*tfv1alpha1.TFReplicaStat
// isRetryableTerminationState returns true if a container terminated in a state
// that we consider retryable.
func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool {
// TODO(jlewi): Need to match logic in
// https://cs.corp.google.com/piper///depot/google3/cloud/ml/beta/job/training_job_state_util.cc?l=88
if s.Reason == "OOMKilled" {
// If the user's process causes an OOM and Docker kills the container,
// the termination reason of ContainerState will be specified to
Expand All @@ -203,27 +202,7 @@ func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool {
return false
}

// TODO(jlewi): Should we use the exit code reported in the termination
// log message and not the ExitCode reported by the container.

if s.ExitCode >= 0 && s.ExitCode <= 127 {
// For the exit_code in [0, 127]:
// 0 means success,
// 1 - 127 corresponds to permanent user errors.
// We don't want to retry for both cases.
// More info about exit status can be found in:
// https://www.gnu.org/software/bash/manual/html_node/Exit-Status.html
return false
}

// For the remaining cases that exit_code from workers that doesn't
// fall into [0, 127]. They can be:
// 137 corresponds to SIGKILL,
// 143 corresponds to SIGTERM,
// other values that have undefined behavior.
// We treat them as internal errors for now and all the internal errors
// will be retired.
return true
return train_util.IsRetryableExitCode(s.ExitCode)
}

// masterName returns the name of master replica of provided training job
Expand Down Expand Up @@ -268,7 +247,7 @@ func (j *TrainingJob) setup(config *tfv1alpha1.ControllerConfig) {
}
}

// // setupReplicas creates in memory data structures corresponding to the replicas.
// setupReplicas creates in memory data structures corresponding to the replicas.
func (j *TrainingJob) setupReplicas() error {
if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) {
j.Replicas = make([]*TFReplicaSet, 0, len(j.job.Spec.ReplicaSpecs))
Expand Down
30 changes: 29 additions & 1 deletion pkg/trainer/training_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func TestIsRetryableTerminationState(t *testing.T) {
ExitCode: 244,
Message: "some reason",
},
Expected: true,
Expected: false,
},
{
State: v1.ContainerStateTerminated{
Expand All @@ -78,6 +78,34 @@ func TestIsRetryableTerminationState(t *testing.T) {
},
Expected: false,
},
{
// Exit code that indicates container was killed by SIGKILL.
State: v1.ContainerStateTerminated{
ExitCode: 137,
},
Expected: true,
},
{
// Exit code reserved for user defined retryable errors.
State: v1.ContainerStateTerminated{
ExitCode: 138,
},
Expected: true,
},
{
// Exit code that indicates container was killed by SIGSEGV.
State: v1.ContainerStateTerminated{
ExitCode: 139,
},
Expected: false,
},
{
// Exit code that indicates container was killed by SIGTERM.
State: v1.ContainerStateTerminated{
ExitCode: 143,
},
Expected: true,
},
}

for _, c := range cases {
Expand Down
53 changes: 53 additions & 0 deletions pkg/util/train/train_util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright 2018 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package that various helper routines for training.
package train

func IsRetryableExitCode(exitCode int32) bool {
if exitCode == 1 || exitCode == 2 || exitCode == 126 ||
exitCode == 127 || exitCode == 128 || exitCode == 139 {
// Refers to http://tldp.org/LDP/abs/html/exitcodes.html, we identify the following exit codes
// as permanent errors:
// 1: General errors
// 2: Misuse of shell builtins
// 126: Command invoked cannot execute
// 127: Command not found
// 128: Invalid argument to exit
// 139(128+11): terminated by SIGSEGV(Invalid memory reference)
return false
}

if exitCode == 130 || exitCode == 137 || exitCode == 143 {
// We think it's retryable error if the container exits due to the following sys signals
// that are usually caused by transient issues(e.g. VM was rescheduled):
// 130(128+2): Container terminated by Control-C
// 137(128+9): Container received a SIGKILL
// 143(128+15): Container received a SIGTERM
// The exit code of container will be 128 + n for fatal error signals.
// More info can be found in:
// http://tldp.org/LDP/abs/html/exitcodes.html,
// https://stackoverflow.com/questions/31297616/what-is-the-authoritative-list-of-docker-run-exit-codes
return true
}

if exitCode == 138 {
// We allow users to specify exit code for the cases that they think should retry.
// We decide to take the exit code of SIGUSR1(138 = 128 + 10) for user defined retryable error.
return true
}

// We make no guarantee for other exit status. Currently handling them same as permanent errors.
return false
}

0 comments on commit 2386e11

Please sign in to comment.