handler.go
package controller

import (
	"context"
	"fmt"
	"reflect"
	"runtime/debug"
	"time"

	"github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core"

	"github.com/flyteorg/flytestdlib/contextutils"
	"github.com/flyteorg/flytestdlib/promutils/labeled"

	eventsErr "github.com/flyteorg/flytepropeller/events/errors"
	"github.com/flyteorg/flytepropeller/pkg/apis/flyteworkflow/v1alpha1"
	"github.com/flyteorg/flytepropeller/pkg/controller/config"
	"github.com/flyteorg/flytepropeller/pkg/controller/workflowstore"

	"github.com/flyteorg/flytestdlib/logger"
	"github.com/flyteorg/flytestdlib/promutils"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/flyteorg/flytepropeller/pkg/controller/executors"
)

// TODO: Let's move everything to use controller-runtime.
type propellerMetrics struct {
Scope promutils.Scope
DeepCopyTime promutils.StopWatch
RawWorkflowTraversalTime labeled.StopWatch
SystemError labeled.Counter
AbortError labeled.Counter
PanicObserved labeled.Counter
RoundSkipped prometheus.Counter
WorkflowNotFound prometheus.Counter
StreakLength labeled.Counter
RoundTime labeled.StopWatch
}
func newPropellerMetrics(scope promutils.Scope) *propellerMetrics {
roundScope := scope.NewSubScope("round")
return &propellerMetrics{
Scope: scope,
DeepCopyTime: roundScope.MustNewStopWatch("deepcopy", "Total time to deep copy wf object", time.Millisecond),
RawWorkflowTraversalTime: labeled.NewStopWatch("raw", "Total time to traverse the workflow", time.Millisecond, roundScope, labeled.EmitUnlabeledMetric),
SystemError: labeled.NewCounter("system_error", "Failure to reconcile a workflow, system error", roundScope, labeled.EmitUnlabeledMetric),
AbortError: labeled.NewCounter("abort_error", "Failure to abort a workflow, system error", roundScope, labeled.EmitUnlabeledMetric),
PanicObserved: labeled.NewCounter("panic", "Panic during handling or aborting workflow", roundScope, labeled.EmitUnlabeledMetric),
RoundSkipped: roundScope.MustNewCounter("skipped", "Round Skipped because of stale workflow"),
WorkflowNotFound: roundScope.MustNewCounter("not_found", "workflow not found in the cache"),
StreakLength: labeled.NewCounter("streak_length", "Number of consecutive rounds used in fast follow mode", roundScope, labeled.EmitUnlabeledMetric),
RoundTime: labeled.NewStopWatch("round_time", "Total time taken by one round traversing, copying and storing a workflow", time.Millisecond, roundScope, labeled.EmitUnlabeledMetric),
}
}
// Helper method to record system error in the workflow.
func RecordSystemError(w *v1alpha1.FlyteWorkflow, err error) *v1alpha1.FlyteWorkflow {
// Let's mark these as system errors.
// We only want to increase failed attempts and discard any other partial changes to the CRD.
wfDeepCopy := w.DeepCopy()
wfDeepCopy.GetExecutionStatus().IncFailedAttempts()
wfDeepCopy.GetExecutionStatus().SetMessage(err.Error())
return wfDeepCopy
}
// Core Propeller structure that houses the Reconciliation loop for Flytepropeller
type Propeller struct {
wfStore workflowstore.FlyteWorkflow
workflowExecutor executors.Workflow
metrics *propellerMetrics
cfg *config.Config
}
// Initializes all downstream executors
func (p *Propeller) Initialize(ctx context.Context) error {
return p.workflowExecutor.Initialize(ctx)
}
// TryMutateWorkflow will try to mutate the workflow by traversing it and reconciling the desired and actual state.
// The desired state is a fully completed workflow; the actual state is each node's current execution state.
func (p *Propeller) TryMutateWorkflow(ctx context.Context, originalW *v1alpha1.FlyteWorkflow) (*v1alpha1.FlyteWorkflow, error) {
t := p.metrics.DeepCopyTime.Start()
mutableW := originalW.DeepCopy()
t.Stop()
ctx = contextutils.WithWorkflowID(ctx, mutableW.GetID())
if execID := mutableW.GetExecutionID(); execID.WorkflowExecutionIdentifier != nil {
ctx = contextutils.WithProjectDomain(ctx, mutableW.GetExecutionID().Project, mutableW.GetExecutionID().Domain)
}
ctx = contextutils.WithResourceVersion(ctx, mutableW.GetResourceVersion())
maxRetries := uint32(p.cfg.MaxWorkflowRetries)
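// A workflow that is being deleted, or that has exhausted its retry budget, is aborted rather
// than reconciled further.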
if IsDeleted(mutableW) || (mutableW.Status.FailedAttempts > maxRetries) {
var err error
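// Run the abort inside an anonymous function so that a panic in the executor is recovered,
// counted, and converted into an error instead of crashing the controller.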
func() {
defer func() {
if r := recover(); r != nil {
stack := debug.Stack()
err = fmt.Errorf("panic when aborting workflow, Stack: [%s]", string(stack))
logger.Errorf(ctx, err.Error())
p.metrics.PanicObserved.Inc(ctx)
}
}()
err = p.workflowExecutor.HandleAbortedWorkflow(ctx, mutableW, maxRetries)
}()
if err != nil {
p.metrics.AbortError.Inc(ctx)
return nil, err
}
return mutableW, nil
}
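// For a live (non-terminated) workflow, set the finalizer before traversal so the object is not
// deleted out from under us, then run one reconcile pass with the same panic-recovery guard as above.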
if !mutableW.GetExecutionStatus().IsTerminated() {
var err error
SetFinalizerIfEmpty(mutableW, FinalizerKey)
func() {
t := p.metrics.RawWorkflowTraversalTime.Start(ctx)
defer func() {
t.Stop()
if r := recover(); r != nil {
stack := debug.Stack()
err = fmt.Errorf("panic when reconciling workflow, Stack: [%s]", string(stack))
logger.Errorf(ctx, err.Error())
p.metrics.PanicObserved.Inc(ctx)
}
}()
err = p.workflowExecutor.HandleFlyteWorkflow(ctx, mutableW)
}()
if err != nil {
logger.Errorf(ctx, "Error when trying to reconcile workflow. Error [%v]. Error Type[%v]",
err, reflect.TypeOf(err))
p.metrics.SystemError.Inc(ctx)
return nil, err
}
} else {
logger.Warn(ctx, "Workflow is marked as terminated but doesn't have the completed label, marking it as completed.")
}
return mutableW, nil
}
// Handle method is the entry point for the reconciler.
// It compares the actual state with the desired, and attempts to
// converge the two. It then updates the GetExecutionStatus block of the FlyteWorkflow resource
// with the current status of the resource.
// Every FlyteWorkflow transitions through the states shown in the diagram below.
//
// The workflow to be worked on is identified by the given namespace and executionID (which is the name of the workflow).
// The return value should be an error only in the case we wish to retry this workflow.
// <pre>
//
//   +--------+        +---------+        +------------+      +---------+
//   |        |        |         |        |            |      |         |
//   | Ready  +--------> Running +--------> Succeeding +------> Success |
//   |        |        |         |        |            |      |         |
//   +--------+        +---------+        +------------+      +---------+
//       |                  |
//       |                  |
//       |             +----v----+        +---------------------+        +--------+
//       |             |         |        |     (optional)      |        |        |
//       +-------------> Failing +--------> HandlingFailureNode +--------> Failed |
//                      |         |        |                     |        |        |
//                      +---------+        +---------------------+        +--------+
// </pre>
func (p *Propeller) Handle(ctx context.Context, namespace, name string) error {
logger.Infof(ctx, "Processing Workflow.")
defer logger.Infof(ctx, "Completed processing workflow.")
// Get the FlyteWorkflow resource with this namespace/name
w, fetchErr := p.wfStore.Get(ctx, namespace, name)
if fetchErr != nil {
if workflowstore.IsNotFound(fetchErr) {
p.metrics.WorkflowNotFound.Inc()
logger.Warningf(ctx, "Workflow namespace[%v]/name[%v] not found, may be deleted.", namespace, name)
return nil
}
if workflowstore.IsWorkflowStale(fetchErr) {
p.metrics.RoundSkipped.Inc()
logger.Warningf(ctx, "Workflow namespace[%v]/name[%v] Stale.", namespace, name)
return nil
}
logger.Warningf(ctx, "Failed to GetWorkflow, retrying with back-off", fetchErr)
return fetchErr
}
if w.GetExecutionStatus().IsTerminated() {
if HasCompletedLabel(w) && !HasFinalizer(w) {
logger.Debugf(ctx, "Workflow is terminated.")
// This workflow had previously completed, let us ignore it
return nil
}
}
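// Fast-follow mode: keep reconciling the same workflow for up to MaxStreakLength consecutive
// rounds, as long as each round produces a state change, instead of re-enqueueing and waiting
// for the next notification.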
streak := 0
// Wrap in a closure so the final value of streak is recorded; a plain defer would evaluate
// the argument immediately and always record 0.
defer func() { p.metrics.StreakLength.Add(ctx, float64(streak)) }()
maxLength := p.cfg.MaxStreakLength
if maxLength <= 0 {
maxLength = 1
}
for streak = 0; streak < maxLength; streak++ {
t := p.metrics.RoundTime.Start(ctx)
mutatedWf, err := p.TryMutateWorkflow(ctx, w)
if err != nil {
// NOTE: We are overriding the deep copy here, as we are essentially ignoring all mutations.
// We only want to increase failed attempts and discard any other partial changes to the CRD.
mutatedWf = RecordSystemError(w, err)
p.metrics.SystemError.Inc(ctx)
} else if mutatedWf == nil {
logger.Errorf(ctx, "Should not happen! Mutation resulted in a nil workflow!")
return nil
} else {
if !w.GetExecutionStatus().IsTerminated() {
// No updates to the status were detected, so we skip writing to the KubeAPI.
if mutatedWf.Status.Equals(&w.Status) {
logger.Info(ctx, "WF hasn't been updated in this round.")
t.Stop()
return nil
}
}
if mutatedWf.GetExecutionStatus().IsTerminated() {
// If the end result is a terminated workflow, we remove the finalizers and
// add a completed label so that we can avoid polling for this workflow again.
SetCompletedLabel(mutatedWf, time.Now())
ResetFinalizers(mutatedWf)
}
}
// ExecutionNotFound error is returned when flyteadmin is missing the workflow. This is not
// a valid state unless we are experiencing a race condition where the workflow has not yet
// been inserted into the db (i.e. the workflow phase is WorkflowPhaseReady).
if err != nil && eventsErr.IsNotFound(err) && w.GetExecutionStatus().GetPhase() != v1alpha1.WorkflowPhaseReady {
t.Stop()
logger.Errorf(ctx, "Failed to process workflow, failing: %s", err)
// We set the workflow status to failing to abort any active tasks in the next round.
mutableW := w.DeepCopy()
mutableW.Status.UpdatePhase(v1alpha1.WorkflowPhaseFailing, "Workflow execution is missing in flyteadmin, aborting", &core.ExecutionError{
Kind: core.ExecutionError_SYSTEM,
Code: "ExecutionNotFound",
Message: "Workflow execution not found in flyteadmin.",
})
if _, e := p.wfStore.Update(ctx, mutableW, workflowstore.PriorityClassCritical); e != nil {
logger.Errorf(ctx, "Failed to record an ExecutionNotFound workflow as failed, reason: %s. Retrying...", e)
return e
}
return nil
}
// Incompatible cluster means that another cluster has been designated to handle this workflow execution.
// We should early abort in this case, since any events originating from this cluster for this execution will
// be rejected.
if err != nil && eventsErr.IsEventIncompatibleClusterError(err) {
t.Stop()
logger.Errorf(ctx, "No longer designated to process workflow, failing: %s", err)
// We set the workflow status to failing to abort any active tasks in the next round.
mutableW := w.DeepCopy()
mutableW.Status.UpdatePhase(v1alpha1.WorkflowPhaseFailing, "Workflow execution cluster reassigned, aborting", &core.ExecutionError{
Kind: core.ExecutionError_SYSTEM,
Code: string(eventsErr.EventIncompatibleCusterError),
Message: fmt.Sprintf("Workflow execution cluster reassigned: %v", err),
})
if _, e := p.wfStore.Update(ctx, mutableW, workflowstore.PriorityClassCritical); e != nil {
logger.Errorf(ctx, "Failed to record an EventIncompatibleClusterError workflow as failed, reason: %s. Retrying...", e)
return e
}
return nil
}
// TODO: we will need to call UpdateStatus when it is supported. But to preserve metadata (labels/finalizers) we will need to use Update.
// Update the GetExecutionStatus block of the FlyteWorkflow resource. UpdateStatus will not
// allow changes to the Spec of the resource, which is ideal for ensuring
// nothing other than the resource status has been updated.
newWf, updateErr := p.wfStore.Update(ctx, mutatedWf, workflowstore.PriorityClassCritical)
if updateErr != nil {
t.Stop()
// The update has failed; let's check whether this is because the object is too large. If so, fail the workflow.
if workflowstore.IsWorkflowTooLarge(updateErr) {
logger.Errorf(ctx, "Failed storing workflow to the store, reason: %s", updateErr)
p.metrics.SystemError.Inc(ctx)
// Workflow is too large, we will mark the workflow as failing and record it. This will automatically
// propagate the failure in the next round.
mutableW := w.DeepCopy()
mutableW.Status.UpdatePhase(v1alpha1.WorkflowPhaseFailing, "Workflow size has breached threshold, aborting", &core.ExecutionError{
Kind: core.ExecutionError_SYSTEM,
Code: "WorkflowTooLarge",
Message: "Workflow execution state is too large for Flyte to handle.",
})
if _, e := p.wfStore.Update(ctx, mutableW, workflowstore.PriorityClassCritical); e != nil {
logger.Errorf(ctx, "Failed recording a large workflow as failed, reason: %s. Retrying...", e)
return e
}
return nil
}
return updateErr
}
if err != nil {
t.Stop()
// An error was encountered during the round. Let us return so that we can back off gracefully.
return err
}
if mutatedWf.GetExecutionStatus().IsTerminated() || newWf.ResourceVersion == mutatedWf.ResourceVersion {
// Workflow is terminated (no need to continue) or no status was changed, we can wait
logger.Infof(ctx, "Will not fast follow, Reason: Wf terminated? %v, Version matched? %v",
mutatedWf.GetExecutionStatus().IsTerminated(), newWf.ResourceVersion == mutatedWf.ResourceVersion)
t.Stop()
return nil
}
logger.Infof(ctx, "FastFollow Enabled. Detected State change, we will try another round. StreakLength [%d]", streak)
w = newWf
t.Stop()
}
logger.Infof(ctx, "Streak ended at [%d]/Max: [%d]", streak, maxLength)
return nil
}
// NewPropellerHandler creates a new Propeller and initializes metrics
func NewPropellerHandler(_ context.Context, cfg *config.Config, wfStore workflowstore.FlyteWorkflow, executor executors.Workflow, scope promutils.Scope) *Propeller {
metrics := newPropellerMetrics(scope)
return &Propeller{
metrics: metrics,
wfStore: wfStore,
workflowExecutor: executor,
cfg: cfg,
}
}
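
// Illustrative usage sketch (not part of the original file): how a caller might construct and
// drive the handler. The cfg, wfStore, executor, scope, namespace and name values are assumed to
// be provided by the surrounding controller setup.
//
//	p := NewPropellerHandler(ctx, cfg, wfStore, executor, scope)
//	if err := p.Initialize(ctx); err != nil {
//		return err
//	}
//	// Handle is typically invoked by a worker for every enqueued namespace/name key; a returned
//	// error signals that the key should be retried with back-off.
//	if err := p.Handle(ctx, namespace, name); err != nil {
//		return err
//	}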