From 05bc5480af980dca495ba1499a20d1b58adee1d4 Mon Sep 17 00:00:00 2001 From: Syulin7 <735122171@qq.com> Date: Tue, 7 Feb 2023 23:16:24 +0800 Subject: [PATCH] bug fix: pytorchjob success condition Signed-off-by: Syulin7 <735122171@qq.com> --- pkg/controller.v1/pytorch/pytorchjob_controller.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index dc6da77441..550a008429 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -434,7 +434,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, } else { if rtype == kubeflowv1.PyTorchJobReplicaTypeWorker { // TODO(gaocegege): Support SuccessPolicy - if expected == 0 { + // Leave a succeeded condition for the following two cases: + // 1. If all workers have succeeded. + // 2. If `ElasticPolicy` is not nil and any worker has succeeded. + if expected == 0 || (pytorchjob.Spec.ElasticPolicy != nil && succeeded > 0) { msg := fmt.Sprintf("PyTorchJob %s/%s successfully completed.", pytorchjob.Namespace, pytorchjob.Name) r.recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobSucceededReason, msg)