From d2ce091e0ccb0367779c1ce3d4199eb4668bd7dc Mon Sep 17 00:00:00 2001 From: Victor Castell Date: Sat, 27 May 2023 17:26:48 +0200 Subject: [PATCH] Do not retry job when unknown status When the gRPC stream is broken due to any network glitch, ExecutionDone is called with a failed state, this triggers the retry call, but the real status of the task is unknown. This change disables retry when the reason for ExecutionDone is ErrBrokenStream. --- dkron/grpc.go | 7 ++++++- dkron/grpc_client.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dkron/grpc.go b/dkron/grpc.go index f1e2e63b1..55ff64f4f 100644 --- a/dkron/grpc.go +++ b/dkron/grpc.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "net" + "strings" "time" metrics "github.com/armon/go-metrics" @@ -205,8 +206,12 @@ func (grpcs *GRPCServer) ExecutionDone(ctx context.Context, execDoneReq *proto.E } // If the execution failed, retry it until retries limit (default: don't retry) + // Don't retry if the status is unknown execution := NewExecutionFromProto(&pbex) - if !execution.Success && uint(execution.Attempt) < job.Retries+1 { + if !execution.Success && + uint(execution.Attempt) < job.Retries+1 && + !strings.HasPrefix(execution.Output, ErrBrokenStream.Error()) { + // Increment the attempt counter execution.Attempt++ // Keep all execution properties intact except the last output diff --git a/dkron/grpc_client.go b/dkron/grpc_client.go index 17e4d23e4..2661b96c2 100644 --- a/dkron/grpc_client.go +++ b/dkron/grpc_client.go @@ -426,7 +426,7 @@ func (grpcc *GRPCClient) AgentRun(addr string, job *proto.Job, execution *proto. if err != nil { // At this point the execution status will be unknown, set the FinishedAt time and an explanatory message execution.FinishedAt = ptypes.TimestampNow() - execution.Output = []byte(err.Error()) + execution.Output = []byte(ErrBrokenStream.Error() + ": " + err.Error()) grpcc.logger.WithError(err).Error(ErrBrokenStream)