From a797a639456e86b65efc63f86e965a0873b0f7c5 Mon Sep 17 00:00:00 2001 From: Federico Stagni Date: Wed, 22 Nov 2023 17:09:11 +0100 Subject: [PATCH] fix: sets jobStatus=Failed/Payload failed iff the job was running --- src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py index a568bc957d3..be86ab2001d 100755 --- a/src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py @@ -32,6 +32,7 @@ from DIRAC.WorkloadManagementSystem.Client.MatcherClient import MatcherClient from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient +from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport from DIRAC.WorkloadManagementSystem.Client import JobStatus @@ -691,7 +692,7 @@ def _checkSubmittedJobs(self): payloadErrors = [] originalJobID = self.jobReport.jobID for jobID, taskID in self.submissionDict.items(): - if not taskID in self.computingElement.taskResults: + if taskID not in self.computingElement.taskResults: continue result = self.computingElement.taskResults[taskID] @@ -714,7 +715,12 @@ def _checkSubmittedJobs(self): # The payload failed (if result["Value"] is not 0) elif result["Value"]: - self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed") + # In order to avoid overriding perfectly valid states, the status is updated iff the job was running + res = JobMonitoringClient().getJobsStatus(jobID) + if not res["OK"]: + return res + if res["Value"][jobID]["Status"] == JobStatus.RUNNING: + self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed") # Do not keep running and do not overwrite the Payload error message = f"Payload execution failed with error code {result['Value']}"