Skip to content

Commit

Permalink
chore(controller): ignore k8s job fail reason and refine task finish …
Browse files Browse the repository at this point in the history
…time
  • Loading branch information
jialeicui committed Oct 27, 2023
1 parent 6114b4e commit bc9625d
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,23 +139,16 @@ private void reportRunStatus(V1Job job) {
log.warn("no pod stop time found for job {}, use now", jobName);
}

// prefer using the pod failed reason, it has more details
// use the pod failed reason, it has more details
// we have disabled the job retry, so the job failed reason is not useful
var podFailedReason = getPodFailedReason(jobName);

var failedReason = "";
if (StringUtils.hasText(jobFailedReason)) {
failedReason = "job failed: " + jobFailedReason;
}
if (StringUtils.hasText(podFailedReason)) {
failedReason = failedReason + "\npod failed: " + podFailedReason;
}

var report = ReportedRun.builder()
.id(runId)
.status(runStatus)
.startTimeMillis(startTime)
.stopTimeMillis(stopTime)
.failedReason(StringUtils.hasText(failedReason) ? failedReason : null)
.failedReason(StringUtils.hasText(podFailedReason) ? podFailedReason : null)
.build();
runReportReceiver.receive(report);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ public void onRunUpdate(Run run) {
retryNum = null == retryNum ? 0 : retryNum;
Integer userRetryLimit = task.getStep().getSpec().getBackoffLimit();
Integer backOffLimit = userRetryLimit == null ? this.backOffLimit : userRetryLimit;

var updateFinishedTime = true;
if (
run.getStatus() == RunStatus.FAILED
&& retryNum < backOffLimit
Expand All @@ -108,6 +110,8 @@ public void onRunUpdate(Run run) {
Task ot = ((WatchableTask) task).unwrap();
ot.updateStatus(TaskStatus.READY);
}
// do not update finished time for retrying task
updateFinishedTime = false;
} else {
taskNewStatus = taskStatusMachine.transfer(task.getStatus(), run.getStatus());
}
Expand All @@ -118,7 +122,7 @@ public void onRunUpdate(Run run) {
if (run.getStartTime() != null) {
taskMapper.updateTaskStartedTimeIfNotSet(run.getTaskId(), new Date(run.getStartTime()));
}
if (run.getFinishTime() != null) {
if (run.getFinishTime() != null && updateFinishedTime) {
taskMapper.updateTaskFinishedTimeIfNotSet(run.getTaskId(), new Date(run.getFinishTime()));
}
if (run.getIp() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
Expand All @@ -32,6 +33,7 @@
import ai.starwhale.mlops.domain.task.status.TaskStatus;
import ai.starwhale.mlops.domain.task.status.TaskStatusMachine;
import ai.starwhale.mlops.schedule.SwTaskScheduler;
import java.util.Date;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
Expand Down Expand Up @@ -103,4 +105,32 @@ public void testOnUpdate() {

}

@Test
public void testFinishTime() {
var task = Task.builder()
.retryNum(0)
.step(Step.builder().spec(StepSpec.builder().backoffLimit(2).build()).build())
.currentRun(Run.builder().id(1L).status(RunStatus.RUNNING).build())
.build();
when(hotJobHolder.taskWithId(1L)).thenReturn(task);
var reportdRun = Run.builder()
.id(1L)
.taskId(1L)
.status(RunStatus.FAILED)
.finishTime(42L)
.ip("1.1.1.1")
.build();
runReportReceiver.onRunUpdate(reportdRun);
verify(this.taskMapper, never()).updateTaskFinishedTimeIfNotSet(any(), any());

// fail again
task.getCurrentRun().setStatus(RunStatus.RUNNING);
runReportReceiver.onRunUpdate(reportdRun);
verify(this.taskMapper, never()).updateTaskFinishedTimeIfNotSet(any(), any());

// the last attempt
task.getCurrentRun().setStatus(RunStatus.RUNNING);
runReportReceiver.onRunUpdate(reportdRun);
verify(this.taskMapper, times(1)).updateTaskFinishedTimeIfNotSet(1L, new Date(42L));
}
}

0 comments on commit bc9625d

Please sign in to comment.