From c85040aba7cee1c55bca67b1f669077f6c0b92f4 Mon Sep 17 00:00:00 2001
From: yu lin <37265556+Syulin7@users.noreply.github.com>
Date: Wed, 8 Feb 2023 17:52:33 +0800
Subject: [PATCH] Fix the success condition of the job in PyTorchJob's Elastic
 mode. (#1752)

Signed-off-by: Syulin7 <735122171@qq.com>
---
 pkg/controller.v1/pytorch/pytorchjob_controller.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go
index dc6da77441..550a008429 100644
--- a/pkg/controller.v1/pytorch/pytorchjob_controller.go
+++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go
@@ -434,7 +434,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
 		} else {
 			if rtype == kubeflowv1.PyTorchJobReplicaTypeWorker {
 				// TODO(gaocegege): Support SuccessPolicy
-				if expected == 0 {
+				// Leave a succeeded condition for the following two cases:
+				// 1. If all workers have succeeded.
+				// 2. If `ElasticPolicy` is not nil and at least one worker has succeeded.
+				if expected == 0 || (pytorchjob.Spec.ElasticPolicy != nil && succeeded > 0) {
 					msg := fmt.Sprintf("PyTorchJob %s/%s successfully completed.",
 						pytorchjob.Namespace, pytorchjob.Name)
 					r.recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobSucceededReason, msg)