-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: introduce cron job metrics #2256
Changes from 2 commits
277eac8
f44c0c7
61dafb6
1eb564b
390fe34
820f7d8
a949bf1
2f09321
e2befca
c4445db
6bf334d
f94a358
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package observability | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"go.opentelemetry.io/otel" | ||
"go.opentelemetry.io/otel/attribute" | ||
"go.opentelemetry.io/otel/metric" | ||
|
||
"github.com/TBD54566975/ftl/internal/model" | ||
"github.com/TBD54566975/ftl/internal/observability" | ||
) | ||
|
||
const (
	// cronMeterName is the prefix used when building the names of the cron
	// instruments created in initCronMetrics (e.g. "<prefix>.jobs.active").
	cronMeterName = "ftl.cron"
	// cronJobRefAttribute is the attribute key under which each measurement
	// records the string form of the cron job key.
	cronJobRefAttribute = "ftl.cron.job.ref"
)
|
||
type CronMetrics struct { | ||
jobFailures metric.Int64Counter | ||
jobsKilled metric.Int64Counter | ||
jobsActive metric.Int64UpDownCounter | ||
jobLatency metric.Int64Histogram | ||
} | ||
|
||
func initCronMetrics() (*CronMetrics, error) { | ||
result := &CronMetrics{} | ||
|
||
var errs error | ||
var err error | ||
|
||
meter := otel.Meter(deploymentMeterName) | ||
|
||
counter := fmt.Sprintf("%s.job.failures", cronMeterName) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/failures/completed/ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😅 thanks for spotting that, updated |
||
if result.jobFailures, err = meter.Int64Counter( | ||
counter, | ||
metric.WithDescription("the number of failures encountered while performing activities associated with starting or ending a cron job")); err != nil { | ||
result.jobFailures, errs = handleInt64CounterError(counter, err, errs) | ||
} | ||
|
||
counter = fmt.Sprintf("%s.jobs.kills", cronMeterName) | ||
if result.jobsKilled, err = meter.Int64Counter( | ||
counter, | ||
metric.WithDescription("the number cron jobs killed by the controller")); err != nil { | ||
result.jobsKilled, errs = handleInt64CounterError(counter, err, errs) | ||
} | ||
|
||
counter = fmt.Sprintf("%s.jobs.active", cronMeterName) | ||
if result.jobsActive, err = meter.Int64UpDownCounter( | ||
counter, | ||
metric.WithDescription("the number of actively executing cron jobs")); err != nil { | ||
result.jobsActive, errs = handleInt64UpDownCounterError(counter, err, errs) | ||
} | ||
|
||
counter = fmt.Sprintf("%s.job.latency", cronMeterName) | ||
if result.jobLatency, err = meter.Int64Histogram( | ||
counter, | ||
metric.WithDescription("the latency between the scheduled execution time and completion of a cron job"), | ||
metric.WithUnit("ms")); err != nil { | ||
result.jobLatency, errs = handleInt64HistogramCounterError(counter, err, errs) | ||
} | ||
|
||
return result, errs | ||
} | ||
|
||
func (m *CronMetrics) JobExecutionStarted(ctx context.Context, job model.CronJobKey, deployment model.DeploymentKey) { | ||
m.jobsActive.Add(ctx, 1, metric.WithAttributes( | ||
attribute.String(observability.ModuleNameAttribute, job.Payload.Module), | ||
attribute.String(cronJobRefAttribute, job.String()), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just confirming - is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's the module qualified verb name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahhh in that case should we rename the attribute key to match? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
attribute.String(observability.RunnerDeploymentKeyAttribute, deployment.String()), | ||
)) | ||
} | ||
|
||
func (m *CronMetrics) JobExecutionCompleted(ctx context.Context, job model.CronJobKey, deployment model.DeploymentKey, scheduled time.Time) { | ||
elapsed := time.Since(scheduled) | ||
|
||
m.jobsActive.Add(ctx, -1, metric.WithAttributes( | ||
attribute.String(observability.ModuleNameAttribute, job.Payload.Module), | ||
attribute.String(cronJobRefAttribute, job.String()), | ||
attribute.String(observability.RunnerDeploymentKeyAttribute, deployment.String()), | ||
)) | ||
|
||
m.jobLatency.Record(ctx, elapsed.Milliseconds(), metric.WithAttributes( | ||
attribute.String(observability.ModuleNameAttribute, job.Payload.Module), | ||
attribute.String(cronJobRefAttribute, job.String()), | ||
attribute.String(observability.RunnerDeploymentKeyAttribute, deployment.String()), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since these attributes are identical to the ones above for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
)) | ||
} | ||
|
||
func (m *CronMetrics) JobKilled(ctx context.Context, job model.CronJobKey, deployment model.DeploymentKey) { | ||
m.jobsActive.Add(ctx, -1, metric.WithAttributes( | ||
attribute.String(observability.ModuleNameAttribute, job.Payload.Module), | ||
attribute.String(cronJobRefAttribute, job.String()), | ||
attribute.String(observability.RunnerDeploymentKeyAttribute, deployment.String()), | ||
)) | ||
} | ||
|
||
func (m *CronMetrics) JobFailed(ctx context.Context, job model.CronJobKey, deployment model.DeploymentKey) { | ||
m.jobFailures.Add(ctx, 1, metric.WithAttributes( | ||
attribute.String(observability.ModuleNameAttribute, job.Payload.Module), | ||
attribute.String(cronJobRefAttribute, job.String()), | ||
attribute.String(observability.RunnerDeploymentKeyAttribute, deployment.String()), | ||
)) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of bundling all the failures to kill a job with failures of the actual job itself, could we refactor this to the following?
.succeeded
.succeeded
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like the idea and would like to take it a little further. Removing the `jobFailures` counter does result in a loss of visibility that can be avoided if the `ftl.status.succeeded` attribute can take on a more generalized semantic. Changing from success/fail to "outcome" would help. Outcome is a string representing the canonical set of outcomes for a given domain. So in the metrics where the `ftl.status.succeed` attribute is used today, outcome would have the values of `succeeded` or `failed`.
For cron jobs I would like outcomes covering the following: success, failed to start, execution failure, and killed. This would allow me to remove the `jobsKilled` metric and segment latency by outcome.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated