From 8fcbd33410fd3cc41d23e4c54a302d0b40dd8f18 Mon Sep 17 00:00:00 2001 From: Denise Li Date: Tue, 6 Aug 2024 15:39:32 -0400 Subject: [PATCH] feat: bucketize latency attribute (#2269) This PR bucketizes the latency _attribute_ (NOT the latency _metric_! That would be very bad!) For better filtering capabilities in datadog, we need to bucket our high-cardinality numeric attributes. This uses log base8 because: 1. That seems to give the most useful results for current production numbers we're seeing in DD - good functional granularity for analysis purposes without being so granular that it's annoying to sort through the buckets. Current values range from 62 to 20279, concentrated in the 100-1000 range. 2. Base 8 bucket labels are all powers of 2, so it's relatively easy for our brains to skim Sample output: ``` -> ftl.async_call.time_since_scheduled_at_ms.bucket: Str([8,64)) ``` Issue: https://github.com/TBD54566975/ftl/issues/2194 --- .../controller/observability/async_calls.go | 16 +++--- .../controller/observability/observability.go | 18 +++++++ .../observability/observability_test.go | 53 +++++++++++++++++++ 3 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 backend/controller/observability/observability_test.go diff --git a/backend/controller/observability/async_calls.go b/backend/controller/observability/async_calls.go index 9ccf666d17..2b955dec01 100644 --- a/backend/controller/observability/async_calls.go +++ b/backend/controller/observability/async_calls.go @@ -16,12 +16,12 @@ import ( ) const ( - asyncCallMeterName = "ftl.async_call" - asyncCallOriginAttr = "ftl.async_call.origin" - asyncCallVerbRefAttr = "ftl.async_call.verb.ref" - asyncCallTimeSinceScheduledAtAttr = "ftl.async_call.time_since_scheduled_at_ms" - asyncCallRemainingAttemptsAttr = "ftl.async_call.remaining_attempts" - asyncCallExecFailureModeAttr = "ftl.async_call.execution.failure_mode" + asyncCallMeterName = "ftl.async_call" + asyncCallOriginAttr = "ftl.async_call.origin" + asyncCallVerbRefAttr = "ftl.async_call.verb.ref" + asyncCallTimeSinceScheduledAtBucketAttr = "ftl.async_call.time_since_scheduled_at_ms.bucket" + asyncCallRemainingAttemptsAttr = "ftl.async_call.remaining_attempts" + asyncCallExecFailureModeAttr = "ftl.async_call.execution.failure_mode" ) type AsyncCallMetrics struct { @@ -126,14 +126,14 @@ func (m *AsyncCallMetrics) Completed(ctx context.Context, verb schema.RefKey, or attrs = append(attrs, attribute.Bool(observability.StatusSucceededAttribute, maybeErr == nil)) m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...)) - attrs = append(attrs, attribute.Int64(asyncCallTimeSinceScheduledAtAttr, msToComplete)) + attrs = append(attrs, attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, msToComplete))) m.completed.Add(ctx, 1, metric.WithAttributes(attrs...)) m.queueDepth.Record(ctx, queueDepth) } func extractAsyncCallAttrs(verb schema.RefKey, origin string, scheduledAt time.Time) []attribute.KeyValue { - return append(extractRefAttrs(verb, origin), attribute.Int64(asyncCallTimeSinceScheduledAtAttr, timeSinceMS(scheduledAt))) + return append(extractRefAttrs(verb, origin), attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, timeSinceMS(scheduledAt)))) } func extractRefAttrs(verb schema.RefKey, origin string) []attribute.KeyValue { diff --git a/backend/controller/observability/observability.go b/backend/controller/observability/observability.go index 1d31d42742..138f4dc757 100644 --- a/backend/controller/observability/observability.go +++ b/backend/controller/observability/observability.go @@ -3,6 +3,7 @@ package observability import ( "errors" "fmt" + "math" "time" "go.opentelemetry.io/otel/metric" @@ -47,3 +48,20 @@ func handleInt64UpDownCounterError(counter string, err error, errs error) (metri func timeSinceMS(start time.Time) int64 { return time.Since(start).Milliseconds() } + +// logBucket returns a string bucket label for a given positive number bucketed into +// powers of some arbitrary base. For base 8, for example, we would have buckets: +// +// <1, [1-8), [8-64), [64-512), etc. +// +// Go only supports a few bases with math.Log*, so this func performs a change of base: +// log_b(x) = log_a(x) / log_a(b) +func logBucket(base int, num int64) string { + if num < 1 { + return "<1" + } + b := float64(base) + logB := math.Log(float64(num)) / math.Log(b) + bucketExpLo := math.Floor(logB) + return fmt.Sprintf("[%d,%d)", int(math.Pow(b, bucketExpLo)), int(math.Pow(b, bucketExpLo+1))) +} diff --git a/backend/controller/observability/observability_test.go b/backend/controller/observability/observability_test.go new file mode 100644 index 0000000000..70f2dec9f5 --- /dev/null +++ b/backend/controller/observability/observability_test.go @@ -0,0 +1,53 @@ +package observability + +import ( + "testing" + + "github.com/alecthomas/assert/v2" +) + +func TestLogBucket(t *testing.T) { + tests := []struct { + name string + base int + num int + want string + }{ + { + name: "<1", + base: 8, + num: 0, + want: "<1", + }, + { + name: "EqualLowEndOfRange", + base: 8, + num: 1, + want: "[1,8)", + }, + { + name: "HigherEndOfRange", + base: 8, + num: 7, + want: "[1,8)", + }, + { + name: "BigInputNum", + base: 8, + num: 16800000, + want: "[16777216,134217728)", + }, + { + name: "Base2", + base: 2, + num: 8, + want: "[8,16)", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.want, logBucket(test.base, int64(test.num))) + }) + } +}