Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pod: Submit an event when the user specifies the restartpolicy for pod template #648

Merged
merged 3 commits into from
Jun 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions pkg/controller.v2/controller_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,12 @@ import (
)

const (
// tfConfig is the environment variable name of TensorFlow cluster spec.
tfConfig = "TF_CONFIG"

// podTemplateRestartPolicyReason is the warning reason when the restart
// policy is setted in pod template.
podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
)

// reconcilePods checks and updates pods for each given TFReplicaSpec.
Expand Down Expand Up @@ -152,14 +157,14 @@ func (tc *TFJobController) createNewPod(tfjob *tfv1alpha2.TFJob, rt, index strin
return err
}

if spec.RestartPolicy == tfv1alpha2.RestartPolicyExitCode {
podTemplate.Spec.RestartPolicy = v1.RestartPolicyNever
} else if spec.RestartPolicy == tfv1alpha2.RestartPolicy("") {
// Set default to Never.
podTemplate.Spec.RestartPolicy = v1.RestartPolicyNever
} else {
podTemplate.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy)
// Submit a warning event if the user specifies restart policy for
// the pod template. We recommend to set it from the replica level.
if podTemplate.Spec.RestartPolicy != v1.RestartPolicy("") {
errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec"
loggerForReplica(tfjob, rt).Warning(errMsg)
tc.recorder.Event(tfjob, v1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg)
}
setRestartPolicy(podTemplate, spec)

err = tc.podControl.CreatePodsWithControllerRef(tfjob.Namespace, podTemplate, tfjob, controllerRef)
if err != nil && errors.IsTimeout(err) {
Expand Down Expand Up @@ -200,6 +205,17 @@ func setClusterSpec(podTemplateSpec *v1.PodTemplateSpec, tfjob *tfv1alpha2.TFJob
return nil
}

func setRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *tfv1alpha2.TFReplicaSpec) {
if spec.RestartPolicy == tfv1alpha2.RestartPolicyExitCode {
podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicyNever
} else if spec.RestartPolicy == tfv1alpha2.RestartPolicy("") {
// Set default to Never.
podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicyNever
} else {
podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy)
}
}

// getPodsForTFJob returns the set of pods that this tfjob should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned Pods are pointers into the cache.
Expand Down
68 changes: 68 additions & 0 deletions pkg/controller.v2/controller_pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,71 @@ func TestClusterSpec(t *testing.T) {
}
}
}

func TestRestartPolicy(t *testing.T) {
type tc struct {
tfJob *tfv1alpha2.TFJob
expectedRestartPolicy v1.RestartPolicy
expectedType tfv1alpha2.TFReplicaType
}
testCase := []tc{
func() tc {
tfJob := newTFJob(1, 0)
specRestartPolicy := tfv1alpha2.RestartPolicyExitCode
tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy
return tc{
tfJob: tfJob,
expectedRestartPolicy: v1.RestartPolicyNever,
expectedType: tfv1alpha2.TFReplicaTypeWorker,
}
}(),
func() tc {
tfJob := newTFJob(1, 0)
specRestartPolicy := tfv1alpha2.RestartPolicyNever
tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy
return tc{
tfJob: tfJob,
expectedRestartPolicy: v1.RestartPolicyNever,
expectedType: tfv1alpha2.TFReplicaTypeWorker,
}
}(),
func() tc {
tfJob := newTFJob(1, 0)
specRestartPolicy := tfv1alpha2.RestartPolicyAlways
tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy
return tc{
tfJob: tfJob,
expectedRestartPolicy: v1.RestartPolicyAlways,
expectedType: tfv1alpha2.TFReplicaTypeWorker,
}
}(),
func() tc {
tfJob := newTFJob(1, 0)
specRestartPolicy := tfv1alpha2.RestartPolicyOnFailure
tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy
return tc{
tfJob: tfJob,
expectedRestartPolicy: v1.RestartPolicyOnFailure,
expectedType: tfv1alpha2.TFReplicaTypeWorker,
}
}(),
func() tc {
tfJob := newTFJob(1, 0)
specRestartPolicy := tfv1alpha2.RestartPolicy("")
tfJob.Spec.TFReplicaSpecs[tfv1alpha2.TFReplicaTypeWorker].RestartPolicy = specRestartPolicy
return tc{
tfJob: tfJob,
expectedRestartPolicy: v1.RestartPolicyNever,
expectedType: tfv1alpha2.TFReplicaTypeWorker,
}
}(),
}
for _, c := range testCase {
spec := c.tfJob.Spec.TFReplicaSpecs[c.expectedType]
podTemplate := spec.Template
setRestartPolicy(&podTemplate, spec)
if podTemplate.Spec.RestartPolicy != c.expectedRestartPolicy {
t.Errorf("Expected %s, got %s", c.expectedRestartPolicy, podTemplate.Spec.RestartPolicy)
}
}
}