From 8fcbd33410fd3cc41d23e4c54a302d0b40dd8f18 Mon Sep 17 00:00:00 2001
From: Denise Li <dli@squareup.com>
Date: Tue, 6 Aug 2024 15:39:32 -0400
Subject: [PATCH] feat: bucketize latency attribute (#2269)

This PR bucketizes the latency _attribute_ (NOT the latency _metric_!
That would be very bad!)

For better filtering capabilities in datadog, we need to bucket our
high-cardinality numeric attributes. This uses log base8 because:

1. That seems to give the most useful results for current production
numbers we're seeing in DD - good functional granularity for analysis
purposes without being so granular that it's annoying to sort through
the buckets. Current values range from 62 to 20279, concentrated in the
100-1000 range.
2. Base 8 bucket labels are all powers of 2, so it's relatively easy for
our brains to skim

Sample output:
```
-> ftl.async_call.time_since_scheduled_at_ms.bucket: Str([8,64))
```

Issue: https://github.com/TBD54566975/ftl/issues/2194
---
 .../controller/observability/async_calls.go   | 16 +++---
 .../controller/observability/observability.go | 18 +++++++
 .../observability/observability_test.go       | 53 +++++++++++++++++++
 3 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 backend/controller/observability/observability_test.go

diff --git a/backend/controller/observability/async_calls.go b/backend/controller/observability/async_calls.go
index 9ccf666d17..2b955dec01 100644
--- a/backend/controller/observability/async_calls.go
+++ b/backend/controller/observability/async_calls.go
@@ -16,12 +16,12 @@ import (
 )
 
 const (
-	asyncCallMeterName                = "ftl.async_call"
-	asyncCallOriginAttr               = "ftl.async_call.origin"
-	asyncCallVerbRefAttr              = "ftl.async_call.verb.ref"
-	asyncCallTimeSinceScheduledAtAttr = "ftl.async_call.time_since_scheduled_at_ms"
-	asyncCallRemainingAttemptsAttr    = "ftl.async_call.remaining_attempts"
-	asyncCallExecFailureModeAttr      = "ftl.async_call.execution.failure_mode"
+	asyncCallMeterName                      = "ftl.async_call"
+	asyncCallOriginAttr                     = "ftl.async_call.origin"
+	asyncCallVerbRefAttr                    = "ftl.async_call.verb.ref"
+	asyncCallTimeSinceScheduledAtBucketAttr = "ftl.async_call.time_since_scheduled_at_ms.bucket"
+	asyncCallRemainingAttemptsAttr          = "ftl.async_call.remaining_attempts"
+	asyncCallExecFailureModeAttr            = "ftl.async_call.execution.failure_mode"
 )
 
 type AsyncCallMetrics struct {
@@ -126,14 +126,14 @@ func (m *AsyncCallMetrics) Completed(ctx context.Context, verb schema.RefKey, or
 	attrs = append(attrs, attribute.Bool(observability.StatusSucceededAttribute, maybeErr == nil))
 	m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...))
 
-	attrs = append(attrs, attribute.Int64(asyncCallTimeSinceScheduledAtAttr, msToComplete))
+	attrs = append(attrs, attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, msToComplete)))
 	m.completed.Add(ctx, 1, metric.WithAttributes(attrs...))
 
 	m.queueDepth.Record(ctx, queueDepth)
 }
 
 func extractAsyncCallAttrs(verb schema.RefKey, origin string, scheduledAt time.Time) []attribute.KeyValue {
-	return append(extractRefAttrs(verb, origin), attribute.Int64(asyncCallTimeSinceScheduledAtAttr, timeSinceMS(scheduledAt)))
+	return append(extractRefAttrs(verb, origin), attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, timeSinceMS(scheduledAt))))
 }
 
 func extractRefAttrs(verb schema.RefKey, origin string) []attribute.KeyValue {
diff --git a/backend/controller/observability/observability.go b/backend/controller/observability/observability.go
index 1d31d42742..138f4dc757 100644
--- a/backend/controller/observability/observability.go
+++ b/backend/controller/observability/observability.go
@@ -3,6 +3,7 @@ package observability
 import (
 	"errors"
 	"fmt"
+	"math"
 	"time"
 
 	"go.opentelemetry.io/otel/metric"
@@ -47,3 +48,20 @@ func handleInt64UpDownCounterError(counter string, err error, errs error) (metri
 func timeSinceMS(start time.Time) int64 {
 	return time.Since(start).Milliseconds()
 }
+
+// logBucket returns a string bucket label for a given positive number bucketed into
+// powers of some arbitrary base. For base 8, for example, we would have buckets:
+//
+//	<1, [1-8), [8-64), [64-512), etc.
+//
+// Go only supports a few bases with math.Log*, so this func performs a change of base:
+// log_b(x) = log_a(x) / log_a(b)
+func logBucket(base int, num int64) string {
+	if num < 1 {
+		return "<1"
+	}
+	b := float64(base)
+	logB := math.Log(float64(num)) / math.Log(b)
+	bucketExpLo := math.Floor(logB)
+	return fmt.Sprintf("[%d,%d)", int(math.Pow(b, bucketExpLo)), int(math.Pow(b, bucketExpLo+1)))
+}
diff --git a/backend/controller/observability/observability_test.go b/backend/controller/observability/observability_test.go
new file mode 100644
index 0000000000..70f2dec9f5
--- /dev/null
+++ b/backend/controller/observability/observability_test.go
@@ -0,0 +1,53 @@
+package observability
+
+import (
+	"testing"
+
+	"github.com/alecthomas/assert/v2"
+)
+
+func TestLogBucket(t *testing.T) {
+	tests := []struct {
+		name string
+		base int
+		num  int
+		want string
+	}{
+		{
+			name: "<1",
+			base: 8,
+			num:  0,
+			want: "<1",
+		},
+		{
+			name: "EqualLowEndOfRange",
+			base: 8,
+			num:  1,
+			want: "[1,8)",
+		},
+		{
+			name: "HigherEndOfRange",
+			base: 8,
+			num:  7,
+			want: "[1,8)",
+		},
+		{
+			name: "BigInputNum",
+			base: 8,
+			num:  16800000,
+			want: "[16777216,134217728)",
+		},
+		{
+			name: "Base2",
+			base: 2,
+			num:  8,
+			want: "[8,16)",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			assert.Equal(t, test.want, logBucket(test.base, int64(test.num)))
+		})
+	}
+}