Skip to content

Commit

Permalink
Create resources(services/jobs) only once
Browse files Browse the repository at this point in the history
  • Loading branch information
ScorpioCPH committed Mar 1, 2018
1 parent 0928271 commit 4bb10fc
Showing 1 changed file with 26 additions and 23 deletions.
49 changes: 26 additions & 23 deletions pkg/trainer/training.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,6 @@ func (j *TrainingJob) setup(config *tfv1alpha1.ControllerConfig) {

// setup Replicas. This creates in memory data structures corresponding to the replicas.
func (j *TrainingJob) setupReplicas() error {

if len(j.Replicas) != len(j.job.Spec.ReplicaSpecs) {
j.Replicas = make([]*TFReplicaSet, 0, len(j.job.Spec.ReplicaSpecs))
for _, t := range j.job.Spec.ReplicaSpecs {
Expand Down Expand Up @@ -331,10 +330,8 @@ func (j *TrainingJob) Reconcile(config *tfv1alpha1.ControllerConfig) error {
return err
}

// TODO(jlewi): Can we determine from the CRD status whether we should
// Create the resources or not? We need to ensure the resources exist so for
// now we always call Create.
if j.job.Status.Phase == tfv1alpha1.TFJobPhaseCreating || j.job.Status.Phase == tfv1alpha1.TFJobPhaseRunning {
// Use Status.Phase to determine whether we should create the resources or not.
if j.job.Status.Phase == tfv1alpha1.TFJobPhaseCreating {
// We call Create to make sure all the resources exist and are running.
if cErr := j.createResources(config); cErr != nil {
// TODO(jlewi): Should we eventually give up and mark the job as failed if we can't create the resources?
Expand All @@ -346,26 +343,32 @@ func (j *TrainingJob) Reconcile(config *tfv1alpha1.ControllerConfig) error {
log.Errorf("trainingJobCreateReplicas() error; %v", cErr)
return cErr
}
}

state, replicaStatuses, err := j.GetStatus()
// Call GetStatus in each reconcile loop
state, replicaStatuses, err := j.GetStatus()

j.status.ReplicaStatuses = replicaStatuses
if err != nil {
log.Errorf("GetStatus() for job %v returned error: %v", j.job.ObjectMeta.Name, err)
return err
}
// TODO(jlewi): We should update the Phase if we detect the job is done.
if state == tfv1alpha1.StateFailed {
log.Errorf("Master failed Job: %v.", j.job.ObjectMeta.Name)
j.status.Phase = tfv1alpha1.TFJobPhaseDone
j.status.State = tfv1alpha1.StateFailed
} else if state == tfv1alpha1.StateSucceeded {
log.Infof("Master succeeded Job: %v.", j.job.ObjectMeta.Name)
j.status.Phase = tfv1alpha1.TFJobPhaseDone
j.status.State = tfv1alpha1.StateSucceeded
} else {
log.Infof("Job %v status=%v", j.job.ObjectMeta.Name, util.Pformat(j.status))
}
j.status.ReplicaStatuses = replicaStatuses
if err != nil {
log.Errorf("GetStatus() for job %v returned error: %v", j.job.ObjectMeta.Name, err)
return err
}

// TODO(jlewi): We should update the Phase if we detect the job is done.
if state == tfv1alpha1.StateFailed {
log.Errorf("Master failed Job: %v.", j.job.ObjectMeta.Name)
j.status.Phase = tfv1alpha1.TFJobPhaseDone
j.status.State = tfv1alpha1.StateFailed
} else if state == tfv1alpha1.StateSucceeded {
log.Infof("Master succeeded Job: %v.", j.job.ObjectMeta.Name)
j.status.Phase = tfv1alpha1.TFJobPhaseDone
j.status.State = tfv1alpha1.StateSucceeded
} else if state == tfv1alpha1.StateRunning {
log.Infof("Master running Job: %v.", j.job.ObjectMeta.Name)
j.status.Phase = tfv1alpha1.TFJobPhaseRunning
j.status.State = tfv1alpha1.StateRunning
} else {
log.Infof("Job %v status=%v", j.job.ObjectMeta.Name, util.Pformat(j.status))
}

// If the phase changed we should update the CRD.
Expand Down

0 comments on commit 4bb10fc

Please sign in to comment.