From c85040aba7cee1c55bca67b1f669077f6c0b92f4 Mon Sep 17 00:00:00 2001 From: yu lin <37265556+Syulin7@users.noreply.github.com> Date: Wed, 8 Feb 2023 17:52:33 +0800 Subject: [PATCH] Fix the success condition of the job in PyTorchJob's Elastic mode. (#1752) Signed-off-by: Syulin7 <735122171@qq.com> --- pkg/controller.v1/pytorch/pytorchjob_controller.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index dc6da77441..550a008429 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -434,7 +434,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, } else { if rtype == kubeflowv1.PyTorchJobReplicaTypeWorker { // TODO(gaocegege): Support SuccessPolicy - if expected == 0 { + // Leave a succeeded condition for the following two cases: + // 1. If all workers have succeeded. + // 2. If `ElasticPolicy` is not nil and any worker has succeeded. + if expected == 0 || (pytorchjob.Spec.ElasticPolicy != nil && succeeded > 0) { msg := fmt.Sprintf("PyTorchJob %s/%s successfully completed.", pytorchjob.Namespace, pytorchjob.Name) r.recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobSucceededReason, msg)