Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: tfjob with restartPolicy=ExitCode not work (#1562) #1

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
fix: tfjob with restartPolicy=ExitCode not work (kubeflow#1562)
(cherry picked from commit 9cc1cc9)
cheimu authored and Derek Whatley committed Jul 26, 2022
commit eee8d6bebff324cb65b26dd2bb81ec249595702d
29 changes: 22 additions & 7 deletions pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
@@ -410,6 +410,20 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
r.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
}
}

// For the situation that jobStatus has a restarting condition, and append a running condition,
// the restarting condition will be removed from jobStatus by commonv1.filterOutCondition(),
// so we need to record the existing restarting condition for later use.
var existingRestartingCondition *commonv1.JobCondition
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
existingRestartingCondition = &commonv1.JobCondition{
Reason: condition.Reason,
Message: condition.Message,
}
}
}

// iterate the replica spec based on this order
allTypes := []commonv1.ReplicaType{
tensorflowv1.TFReplicaTypeChief,
@@ -504,14 +518,15 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
}

if failed > 0 {
restart := false
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
restart = true
// For the situation that jobStatus has a restarting condition, and appends a new running condition,
// the restarting condition will be removed from jobStatus by commonv1.filterOutCondition(),
// so we need to append the restarting condition back to jobStatus.
if existingRestartingCondition != nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message)
if err != nil {
commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
return err
}
}

if restart {
// job is restarting, no need to set it failed
// we know it because we update the status condition when reconciling the replicas
distributed := trainingoperatorcommon.MapBoolToDistributed(isDistributed(tfJob))