/**
 * Reports event loop lag through an observable gauge.
 *
 * Each time the gauge is collected, the monitor measures how long it takes for a
 * callback scheduled with `setImmediate` to run and reports that delay in
 * microseconds (rounded down to an integer).
 */
export class EventLoopMonitor {
  /** Gauge that the lag measurement callback is attached to. */
  private readonly eventLoopLag: ObservableGauge;

  /** True while `measureLag` is registered on the gauge. */
  private started = false;

  /**
   * @param meter - Meter used to create the event loop lag gauge.
   */
  constructor(meter: Meter) {
    this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, {
      unit: 'us',
      valueType: ValueType.INT,
      description: 'How busy is the event loop',
    });
  }

  /**
   * Registers the lag measurement callback on the gauge.
   * Calling this while already started is a no-op, so the callback is never registered twice.
   */
  start(): void {
    if (this.started) {
      return;
    }
    // Record the state change; without it the guards in start()/stop() never take effect.
    this.started = true;
    this.eventLoopLag.addCallback(this.measureLag);
  }

  /**
   * Removes the lag measurement callback from the gauge.
   * Calling this while not started is a no-op.
   */
  stop(): void {
    if (!this.started) {
      return;
    }
    this.started = false;
    this.eventLoopLag.removeCallback(this.measureLag);
  }

  /**
   * Gauge callback: measures the delay until the next `setImmediate` callback runs and
   * reports it on `obs`.
   *
   * Declared as an arrow-function property so the same function reference is passed to
   * both `addCallback` and `removeCallback`.
   */
  private measureLag = async (obs: ObservableResult): Promise<void> => {
    const timer = new Timer();
    const { promise, resolve } = promiseWithResolvers<number>();
    // how long does it take to schedule the next macro task?
    // if this number spikes then we're (1) either blocking the event loop with long running sync code
    // or (2) spamming the event loop with micro tasks
    setImmediate(() => {
      resolve(timer.us());
    });

    const lag = await promise;
    // The gauge is declared with ValueType.INT, so report whole microseconds.
    obs.observe(Math.floor(lag));
  };
}
this.meterProvider, }); + this.eventLoopMonitor = new EventLoopMonitor( + this.meterProvider.getMeter(this.resource.attributes[ATTR_SERVICE_NAME] as string), + ); + // See these two links for more information on providing target information: // https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#resource-attributes // https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#supporting-target-metadata-in-both-push-based-and-pull-based-systems @@ -96,6 +102,7 @@ export class OpenTelemetryClient implements TelemetryClient { this.targetInfo.record(1, this.resource.attributes); this.hostMetrics.start(); + this.eventLoopMonitor.start(); } public isEnabled() { @@ -111,6 +118,8 @@ export class OpenTelemetryClient implements TelemetryClient { } public async stop() { + this.eventLoopMonitor?.stop(); + const flushAndShutdown = async (provider: { forceFlush: () => Promise; shutdown: () => Promise }) => { await provider.forceFlush(); await provider.shutdown();