Stop executing retried Tasks past their timeouts
Today, when a `Task` is retried, it executes past its timeout. Even worse, a
`Task` executing past its timeout causes `Finally Tasks` to be timed out
before they have executed for their allocated timeouts. As such, the execution
of `Finally Tasks` is not guaranteed even when a `finally` timeout has been
allocated.

@ornew described the problem in the issue report:
"The problem for me is that Tekton still doesn't guarantee the execution of
`finally` at all. This is a really serious problem for operating in
production, as the Tekton we operate has caused a lot of resource leaks and a
lot of damage by not running `finally`. It's the biggest problem with the
existing `timeout`."

This problem is caused by the interaction between retries and
timeouts. When a `TaskRun` is retried:
- The status of its failed execution is stored in `RetriesStatus`.
- The status of the `TaskRun` is updated, including marking it as
`Running` and resetting its `StartTime`.

Resetting the `StartTime` is useful for recording the start time of
each retry in the `RetriesStatus`. However, using the reset time as the
start time of the `TaskRun` when checking whether it has timed out
is incorrect and causes the issues described above.

In this change, we use the actual start time of the `TaskRun` to check
whether it has timed out. We do this by checking the start times of
previous attempts as well, instead of only the current attempt.
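
To make the effect concrete, here is a small, self-contained sketch of the
check (illustrative only; the durations mirror the new test case, and the
actual logic lives in `HasTimedOut` and `getStartTime` in the diff below):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 10 * time.Second
	now := time.Now()

	// The first attempt started 11s ago and failed; its start time is recorded
	// in RetriesStatus. The retry reset StartTime, so the current attempt
	// appears to have started only 5s ago.
	firstAttemptStart := now.Add(-11 * time.Second)
	currentAttemptStart := now.Add(-5 * time.Second)

	// The actual start time is the earliest start time across all attempts.
	actualStart := currentAttemptStart
	if firstAttemptStart.Before(actualStart) {
		actualStart = firstAttemptStart
	}

	fmt.Println(time.Since(currentAttemptStart) > timeout) // false: the retry alone looks within the timeout
	fmt.Println(time.Since(actualStart) > timeout)         // true: the TaskRun as a whole has timed out
}
```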

Alternative approaches considered include:
- not resetting the start time of the `TaskRun` upon retry; however, it
would then be challenging to know the execution time of each retry.
- keeping track of the actual start time in an extra field, but this
information is already available in the status.

References:
- [TEP-0046: Finally tasks execution post pipelinerun timeout](https://github.com/tektoncd/community/blob/main/teps/0046-finallytask-execution-post-timeout.md)
- [Issue: task time exceeded `timeouts.tasks` when task retried](tektoncd#4071)
- [Issue: Allow finally tasks to execute after pipeline timeout](tektoncd#2989)

Co-authored-by: Lee Bernick <[email protected]>
jerop and Lee Bernick committed Nov 10, 2021
1 parent 4db26b1 commit 37e06b8
Showing 5 changed files with 91 additions and 4 deletions.
2 changes: 2 additions & 0 deletions docs/pipelineruns.md
@@ -528,6 +528,8 @@ The `timeout` value is a `duration` conforming to Go's
values are `1h30m`, `1h`, `1m`, and `60s`. If you set the global timeout to 0, all `PipelineRuns`
that do not have an individual timeout set will fail immediately upon encountering an error.

The above `timeouts` include the time used to execute `retries`.

## Monitoring execution status

As your `PipelineRun` executes, its `status` field accumulates information on the execution of each `TaskRun`
2 changes: 1 addition & 1 deletion docs/pipelines.md
@@ -763,7 +763,7 @@ tasks:
### Configuring the failure timeout

You can use the `Timeout` field in the `Task` spec within the `Pipeline` to set the timeout
of the `TaskRun` that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
of the `TaskRun` (and its `retries`) that executes that `Task` within the `PipelineRun` that executes your `Pipeline.`
The `Timeout` value is a `duration` conforming to Go's [`ParseDuration`](https://golang.org/pkg/time/#ParseDuration)
format. For example, valid values are `1h30m`, `1h`, `1m`, and `60s`.

2 changes: 2 additions & 0 deletions docs/taskruns.md
@@ -398,6 +398,8 @@ If a `TaskRun` runs longer than its timeout value, the pod associated with the `
means that the logs of the `TaskRun` are not preserved. The deletion of the `TaskRun` pod is necessary in order to
stop `TaskRun` step containers from running.

The above `timeout` is inclusive of the time used to execute the `TaskRun`'s `retries`.

### Specifying `ServiceAccount` credentials

You can execute the `Task` in your `TaskRun` with a specific set of credentials by
21 changes: 19 additions & 2 deletions pkg/apis/pipeline/v1beta1/taskrun_types.go
@@ -385,18 +385,35 @@ func (tr *TaskRun) IsCancelled() bool {

// HasTimedOut returns true if the TaskRun runtime is beyond the allowed timeout
func (tr *TaskRun) HasTimedOut(ctx context.Context) bool {
if tr.Status.StartTime.IsZero() {
startTime := tr.getStartTime()
if startTime.IsZero() {
return false
}
timeout := tr.GetTimeout(ctx)
// If timeout is set to 0 or defaulted to 0, there is no timeout.
if timeout == apisconfig.NoTimeoutDuration {
return false
}
runtime := time.Since(tr.Status.StartTime.Time)
runtime := time.Since(startTime.Time)
return runtime > timeout
}

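// getStartTime returns the earliest start time recorded for this TaskRun,
// taking the start times of previous retry attempts into account.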
func (tr *TaskRun) getStartTime() *metav1.Time {
startTime := tr.Status.StartTime
if startTime.IsZero() {
if len(tr.Status.RetriesStatus) == 0 {
return startTime
}
startTime = &metav1.Time{Time: time.Now()}
}
for _, retry := range tr.Status.RetriesStatus {
if retry.StartTime.Time.Before(startTime.Time) {
startTime = retry.StartTime
}
}
return startTime
}

// GetTimeout returns the timeout for the TaskRun, or the default if not specified
func (tr *TaskRun) GetTimeout(ctx context.Context) time.Duration {
// Use the platform default if no timeout is set
68 changes: 67 additions & 1 deletion pkg/apis/pipeline/v1beta1/taskrun_types_test.go
@@ -309,12 +309,78 @@ func TestHasTimedOut(t *testing.T) {
},
},
expectedStatus: true,
}, {
name: "TaskRun timed out with retries",
taskRun: &v1beta1.TaskRun{
Spec: v1beta1.TaskRunSpec{
Timeout: &metav1.Duration{
Duration: 10 * time.Second,
},
},
Status: v1beta1.TaskRunStatus{
Status: duckv1beta1.Status{
Conditions: []apis.Condition{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}},
},
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
StartTime: &metav1.Time{Time: time.Now().Add(-5 * time.Second)},
RetriesStatus: []v1beta1.TaskRunStatus{{
Status: duckv1beta1.Status{
Conditions: []apis.Condition{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}},
},
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
},
}},
},
},
},
expectedStatus: true,
}, {
name: "TaskRun timed out with retries but current execution start time is nil",
taskRun: &v1beta1.TaskRun{
Spec: v1beta1.TaskRunSpec{
Timeout: &metav1.Duration{
Duration: 10 * time.Second,
},
},
Status: v1beta1.TaskRunStatus{
Status: duckv1beta1.Status{
Conditions: []apis.Condition{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}},
},
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
StartTime: nil,
RetriesStatus: []v1beta1.TaskRunStatus{{
Status: duckv1beta1.Status{
Conditions: []apis.Condition{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}},
},
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
StartTime: &metav1.Time{Time: time.Now().Add(-11 * time.Second)},
CompletionTime: &metav1.Time{Time: time.Now().Add(-6 * time.Second)},
},
}},
},
},
},
expectedStatus: true,
}}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
result := tc.taskRun.HasTimedOut(context.Background())
if d := cmp.Diff(result, tc.expectedStatus); d != "" {
if d := cmp.Diff(tc.expectedStatus, result); d != "" {
t.Fatalf(diff.PrintWantGot(d))
}
})
