Skip to content

Commit

Permalink
Fix transient secret sync error handling (#4310)
Browse files Browse the repository at this point in the history
* Extra logging

Signed-off-by: Thomas Newton <[email protected]>

* Hopefully fix it

Signed-off-by: Thomas Newton <[email protected]>

* Remove debug prints

Signed-off-by: Thomas Newton <[email protected]>

* Reduce duplication

Signed-off-by: Thomas Newton <[email protected]>

* Tidy

Signed-off-by: Thomas Newton <[email protected]>

* Add a test

Signed-off-by: Thomas Newton <[email protected]>

* Use a CreateContainerConfigErrorGracePeriod

Signed-off-by: Thomas Newton <[email protected]>

* Add grace period to error message

Signed-off-by: Thomas Newton <[email protected]>

* Retryable failure with cleanup

Signed-off-by: Thomas Newton <[email protected]>

* Longer grace period

Signed-off-by: Thomas Newton <[email protected]>

* Fix test

Signed-off-by: Thomas Newton <[email protected]>

* Set default grace period to 0 to avoid changing default behaviour

Signed-off-by: Thomas Newton <[email protected]>

* Update comment

Signed-off-by: Thomas Newton <[email protected]>

* Use permanent error with clean up

Signed-off-by: Thomas Newton <[email protected]>

* Don't use cleanup

Signed-off-by: Thomas Newton <[email protected]>

* Fix test

Signed-off-by: Thomas Newton <[email protected]>

---------

Signed-off-by: Thomas Newton <[email protected]>
Co-authored-by: Dan Rammer <[email protected]>
  • Loading branch information
Tom-Newton and hamersaw authored Dec 6, 2023
1 parent c049865 commit 76114aa
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ var (
CreateContainerErrorGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
CreateContainerConfigErrorGracePeriod: config2.Duration{
Duration: time.Minute * 0,
},
ImagePullBackoffGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
Expand Down Expand Up @@ -136,6 +139,11 @@ type K8sPluginConfig struct {
// one, and the corresponding task marked as failed
CreateContainerErrorGracePeriod config2.Duration `json:"create-container-error-grace-period" pflag:"-,Time to wait for transient CreateContainerError errors to be resolved."`

// Time to wait for transient CreateContainerConfigError errors to be resolved. If the
// error persists past this grace period, it will be inferred to be a permanent error.
// The pod will be deleted, and the corresponding task marked as failed.
CreateContainerConfigErrorGracePeriod config2.Duration `json:"create-container-config-error-grace-period" pflag:"-,Time to wait for transient CreateContainerConfigError errors to be resolved."`

// Time to wait for transient ImagePullBackoff errors to be resolved. If the
// error persists past this grace period, it will be inferred to be a permanent
// one, and the corresponding task marked as failed
Expand Down
31 changes: 26 additions & 5 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,8 +660,9 @@ func DemystifyPending(status v1.PodStatus) (pluginsCore.PhaseInfo, error) {
// approximation of the elapsed time since the last
// transition.
t := c.LastTransitionTime.Time
if time.Since(t) >= config.GetK8sPluginConfig().CreateContainerErrorGracePeriod.Duration {
return pluginsCore.PhaseInfoFailure(finalReason, finalMessage, &pluginsCore.TaskInfo{
gracePeriod := config.GetK8sPluginConfig().CreateContainerErrorGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoFailure(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil
}
Expand All @@ -672,16 +673,32 @@ func DemystifyPending(status v1.PodStatus) (pluginsCore.PhaseInfo, error) {
&pluginsCore.TaskInfo{OccurredAt: &t},
), nil

case "CreateContainerConfigError", "InvalidImageName":
case "CreateContainerConfigError":
t := c.LastTransitionTime.Time
gracePeriod := config.GetK8sPluginConfig().CreateContainerConfigErrorGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoFailure(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil
}
return pluginsCore.PhaseInfoInitializing(
t,
pluginsCore.DefaultPhaseVersion,
fmt.Sprintf("[%s]: %s", finalReason, finalMessage),
&pluginsCore.TaskInfo{OccurredAt: &t},
), nil

case "InvalidImageName":
t := c.LastTransitionTime.Time
return pluginsCore.PhaseInfoFailure(finalReason, finalMessage, &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil

case "ImagePullBackOff":
t := c.LastTransitionTime.Time
if time.Since(t) >= config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration {
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, finalMessage, &pluginsCore.TaskInfo{
gracePeriod := config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil
}
Expand Down Expand Up @@ -715,6 +732,10 @@ func DemystifyPending(status v1.PodStatus) (pluginsCore.PhaseInfo, error) {
return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "Scheduling"), nil
}

func GetMessageAfterGracePeriod(message string, gracePeriod time.Duration) string {
return fmt.Sprintf("Grace period [%s] exceeded|%s", gracePeriod, message)
}

func DemystifySuccess(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCore.PhaseInfo, error) {
for _, status := range append(
append(status.InitContainerStatuses, status.ContainerStatuses...), status.EphemeralContainerStatuses...) {
Expand Down
34 changes: 29 additions & 5 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1175,6 +1175,9 @@ func TestDemystifyPending(t *testing.T) {
CreateContainerErrorGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
CreateContainerConfigErrorGracePeriod: config1.Duration{
Duration: time.Minute * 4,
},
ImagePullBackoffGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
Expand Down Expand Up @@ -1398,19 +1401,40 @@ func TestDemystifyPending(t *testing.T) {
assert.Equal(t, pluginsCore.PhaseRetryableFailure, taskStatus.Phase())
})

t.Run("CreateContainerConfigError", func(t *testing.T) {
s.ContainerStatuses = []v1.ContainerStatus{
t.Run("CreateContainerConfigErrorWithinGracePeriod", func(t *testing.T) {
s2 := *s.DeepCopy()
s2.Conditions[0].LastTransitionTime = metav1.Now()
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: "CreateContainerConfigError",
Message: "this an error",
Message: "this is a transient error",
},
},
},
}
taskStatus, err := DemystifyPending(s)
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhaseInitializing, taskStatus.Phase())
})

t.Run("CreateContainerConfigErrorOutsideGracePeriod", func(t *testing.T) {
s2 := *s.DeepCopy()
s2.Conditions[0].LastTransitionTime.Time = metav1.Now().Add(-config.GetK8sPluginConfig().CreateContainerConfigErrorGracePeriod.Duration)
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: "CreateContainerConfigError",
Message: "this a permanent error",
},
},
},
}
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhasePermanentFailure, taskStatus.Phase())
})
Expand Down Expand Up @@ -1583,7 +1607,7 @@ func TestDemystifyPending_testcases(t *testing.T) {
errCode string
message string
}{
{"ImagePullBackOff", "imagepull-failurepod.json", false, "ContainersNotReady|ImagePullBackOff", "containers with unready status: [fdf98e4ed2b524dc3bf7-get-flyte-id-task-0]|Back-off pulling image \"image\""},
{"ImagePullBackOff", "imagepull-failurepod.json", false, "ContainersNotReady|ImagePullBackOff", "Grace period [3m0s] exceeded|containers with unready status: [fdf98e4ed2b524dc3bf7-get-flyte-id-task-0]|Back-off pulling image \"image\""},
}

for _, tt := range tests {
Expand Down

0 comments on commit 76114aa

Please sign in to comment.