diff --git a/README.md b/README.md index e567aed053..4f60bc6d82 100644 --- a/README.md +++ b/README.md @@ -97,8 +97,19 @@ The semantics are as follows * The overall status of the TFJob is determined by the exit code of the tensorflow container * 0 = success - * 1-127 = permanent error - * 128-255 = retryable error + * 1 || 2 || 126 || 127 || 128 || 139 = permanent errors: + * 1: general errors + * 2: misuse of shell builtins + * 126: command invoked cannot execute + * 127: command not found + * 128: invalid argument to exit + * 139: contianer terminated by SIGSEGV(Invalid memory reference) + * 130 || 137 || 143 = retryable error for unexpected system signals: + * 130: contianer terminated by Control-C + * 137: container received a SIGKILL + * 143: container received a SIGTERM + * 138 = reserved in tf-operator for user specified retryable errors + * others = undefined and no guarantee **worker** * A job can have 0 to N workers diff --git a/pkg/trainer/replicas.go b/pkg/trainer/replicas.go index c8cb6407d4..253243b9ac 100644 --- a/pkg/trainer/replicas.go +++ b/pkg/trainer/replicas.go @@ -355,6 +355,7 @@ func replicaStatusFromPodList(l v1.PodList, name string) tfv1alpha1.ReplicaState return tfv1alpha1.ReplicaStateRunning } + // TODO(wenzhel): Should consider to return more info about pod to users. return tfv1alpha1.ReplicaStateFailed } diff --git a/pkg/trainer/training.go b/pkg/trainer/training.go index 823de9ed06..9be40b33cc 100644 --- a/pkg/trainer/training.go +++ b/pkg/trainer/training.go @@ -36,6 +36,7 @@ import ( tfjobclient "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned" "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/scheme" "github.com/kubeflow/tf-operator/pkg/util" + train_util "github.com/kubeflow/tf-operator/pkg/util/train" ) // TODO(jlewi): We should switch a New pattern and make trainingJob private so we can @@ -190,8 +191,6 @@ func (j *TrainingJob) GetStatus() (tfv1alpha1.State, []*tfv1alpha1.TFReplicaStat // isRetryableTerminationState returns true if a container terminated in a state // that we consider retryable. func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool { - // TODO(jlewi): Need to match logic in - // https://cs.corp.google.com/piper///depot/google3/cloud/ml/beta/job/training_job_state_util.cc?l=88 if s.Reason == "OOMKilled" { // If the user's process causes an OOM and Docker kills the container, // the termination reason of ContainerState will be specified to @@ -203,27 +202,7 @@ func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool { return false } - // TODO(jlewi): Should we use the exit code reported in the termination - // log message and not the ExitCode reported by the container. - - if s.ExitCode >= 0 && s.ExitCode <= 127 { - // For the exit_code in [0, 127]: - // 0 means success, - // 1 - 127 corresponds to permanent user errors. - // We don't want to retry for both cases. - // More info about exit status can be found in: - // https://www.gnu.org/software/bash/manual/html_node/Exit-Status.html - return false - } - - // For the remaining cases that exit_code from workers that doesn't - // fall into [0, 127]. They can be: - // 137 corresponds to SIGKILL, - // 143 corresponds to SIGTERM, - // other values that have undefined behavior. - // We treat them as internal errors for now and all the internal errors - // will be retired. - return true + return train_util.IsRetryableExitCode(s.ExitCode) } // masterName returns the name of master replica of provided training job @@ -268,7 +247,7 @@ func (j *TrainingJob) setup(config *tfv1alpha1.ControllerConfig) { } } -// // setupReplicas creates in memory data structures corresponding to the replicas. +// setupReplicas creates in memory data structures corresponding to the replicas. func (j *TrainingJob) setupReplicas() error { if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) { j.Replicas = make([]*TFReplicaSet, 0, len(j.job.Spec.ReplicaSpecs)) diff --git a/pkg/trainer/training_test.go b/pkg/trainer/training_test.go index 5c9718c613..128a3889c1 100644 --- a/pkg/trainer/training_test.go +++ b/pkg/trainer/training_test.go @@ -69,7 +69,7 @@ func TestIsRetryableTerminationState(t *testing.T) { ExitCode: 244, Message: "some reason", }, - Expected: true, + Expected: false, }, { State: v1.ContainerStateTerminated{ @@ -78,6 +78,34 @@ func TestIsRetryableTerminationState(t *testing.T) { }, Expected: false, }, + { + // Exit code that indicates container was killed by SIGKILL. + State: v1.ContainerStateTerminated{ + ExitCode: 137, + }, + Expected: true, + }, + { + // Exit code reserved for user defined retryable errors. + State: v1.ContainerStateTerminated{ + ExitCode: 138, + }, + Expected: true, + }, + { + // Exit code that indicates container was killed by SIGSEGV. + State: v1.ContainerStateTerminated{ + ExitCode: 139, + }, + Expected: false, + }, + { + // Exit code that indicates container was killed by SIGTERM. + State: v1.ContainerStateTerminated{ + ExitCode: 143, + }, + Expected: true, + }, } for _, c := range cases { diff --git a/pkg/util/train/train_util.go b/pkg/util/train/train_util.go new file mode 100644 index 0000000000..e412058f6c --- /dev/null +++ b/pkg/util/train/train_util.go @@ -0,0 +1,53 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package that various helper routines for training. +package train + +func IsRetryableExitCode(exitCode int32) bool { + if exitCode == 1 || exitCode == 2 || exitCode == 126 || + exitCode == 127 || exitCode == 128 || exitCode == 139 { + // Refers to http://tldp.org/LDP/abs/html/exitcodes.html, we identify the following exit codes + // as permanent errors: + // 1: General errors + // 2: Misuse of shell builtins + // 126: Command invoked cannot execute + // 127: Command not found + // 128: Invalid argument to exit + // 139(128+11): terminated by SIGSEGV(Invalid memory reference) + return false + } + + if exitCode == 130 || exitCode == 137 || exitCode == 143 { + // We think it's retryable error if the container exits due to the following sys signals + // that are usually caused by transient issues(e.g. VM was rescheduled): + // 130(128+2): Container terminated by Control-C + // 137(128+9): Container received a SIGKILL + // 143(128+15): Container received a SIGTERM + // The exit code of container will be 128 + n for fatal error signals. + // More info can be found in: + // http://tldp.org/LDP/abs/html/exitcodes.html, + // https://stackoverflow.com/questions/31297616/what-is-the-authoritative-list-of-docker-run-exit-codes + return true + } + + if exitCode == 138 { + // We allow users to specify exit code for the cases that they think should retry. + // We decide to take the exit code of SIGUSR1(138 = 128 + 10) for user defined retryable error. + return true + } + + // We make no guarantee for other exit status. Currently handling them same as permanent errors. + return false +}