-
Notifications
You must be signed in to change notification settings - Fork 302
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: track nodejs runtime metrics (#11160)
This PR implements the OTEL nodejs runtime recommended metrics https://opentelemetry.io/docs/specs/semconv/runtime/nodejs-metrics/
- Loading branch information
Showing
15 changed files
with
244 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
114 changes: 92 additions & 22 deletions
114
yarn-project/telemetry-client/src/event_loop_monitor.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,119 @@ | ||
import { promiseWithResolvers } from '@aztec/foundation/promise'; | ||
import { Timer } from '@aztec/foundation/timer'; | ||
import { type EventLoopUtilization, type IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; | ||
|
||
import { EVENT_LOOP_LAG } from './metrics.js'; | ||
import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js'; | ||
import { NODEJS_EVENT_LOOP_STATE } from './attributes.js'; | ||
import * as Metrics from './metrics.js'; | ||
import { | ||
type BatchObservableResult, | ||
type Meter, | ||
type ObservableGauge, | ||
type UpDownCounter, | ||
ValueType, | ||
} from './telemetry.js'; | ||
|
||
/** | ||
* Detector for custom Aztec attributes | ||
*/ | ||
export class EventLoopMonitor { | ||
private eventLoopLag: ObservableGauge; | ||
private eventLoopDelayGauges: { | ||
min: ObservableGauge; | ||
max: ObservableGauge; | ||
mean: ObservableGauge; | ||
stddev: ObservableGauge; | ||
p50: ObservableGauge; | ||
p90: ObservableGauge; | ||
p99: ObservableGauge; | ||
}; | ||
|
||
private eventLoopUilization: ObservableGauge; | ||
private eventLoopTime: UpDownCounter; | ||
|
||
private started = false; | ||
|
||
constructor(meter: Meter) { | ||
this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, { | ||
unit: 'us', | ||
valueType: ValueType.INT, | ||
private lastELU: EventLoopUtilization | undefined; | ||
private eventLoopDelay: IntervalHistogram; | ||
|
||
constructor(private meter: Meter) { | ||
const nsObsGauge = (name: (typeof Metrics)[keyof typeof Metrics], description: string) => | ||
meter.createObservableGauge(name, { | ||
unit: 'ns', | ||
valueType: ValueType.INT, | ||
description, | ||
}); | ||
|
||
this.eventLoopDelayGauges = { | ||
min: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MIN, 'Minimum delay of the event loop'), | ||
mean: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MEAN, 'Mean delay of the event loop'), | ||
max: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MAX, 'Max delay of the event loop'), | ||
stddev: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_STDDEV, 'Stddev delay of the event loop'), | ||
p50: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P50, 'P50 delay of the event loop'), | ||
p90: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P90, 'P90 delay of the event loop'), | ||
p99: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P99, 'P99 delay of the event loop'), | ||
}; | ||
|
||
this.eventLoopUilization = meter.createObservableGauge(Metrics.NODEJS_EVENT_LOOP_UTILIZATION, { | ||
valueType: ValueType.DOUBLE, | ||
description: 'How busy is the event loop', | ||
}); | ||
|
||
this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { | ||
unit: 'ms', | ||
valueType: ValueType.INT, | ||
description: 'How much time the event loop has spent in a given state', | ||
}); | ||
|
||
this.eventLoopDelay = monitorEventLoopDelay(); | ||
} | ||
|
||
start(): void { | ||
if (this.started) { | ||
return; | ||
} | ||
this.eventLoopLag.addCallback(this.measureLag); | ||
|
||
this.lastELU = performance.eventLoopUtilization(); | ||
this.eventLoopDelay.enable(); | ||
this.meter.addBatchObservableCallback(this.measure, [ | ||
this.eventLoopUilization, | ||
...Object.values(this.eventLoopDelayGauges), | ||
]); | ||
} | ||
|
||
stop(): void { | ||
if (!this.started) { | ||
return; | ||
} | ||
this.eventLoopLag.removeCallback(this.measureLag); | ||
this.meter.removeBatchObservableCallback(this.measure, [ | ||
this.eventLoopUilization, | ||
...Object.values(this.eventLoopDelayGauges), | ||
]); | ||
this.eventLoopDelay.disable(); | ||
this.eventLoopDelay.reset(); | ||
this.lastELU = undefined; | ||
} | ||
|
||
private measureLag = async (obs: ObservableResult): Promise<void> => { | ||
const timer = new Timer(); | ||
const { promise, resolve } = promiseWithResolvers<number>(); | ||
// how long does it take to schedule the next macro task? | ||
// if this number spikes then we're (1) either blocking the event loop with long running sync code | ||
// or (2) spamming the event loop with micro tasks | ||
setImmediate(() => { | ||
resolve(timer.us()); | ||
}); | ||
private measure = (obs: BatchObservableResult): void => { | ||
const newELU = performance.eventLoopUtilization(); | ||
const delta = performance.eventLoopUtilization(newELU, this.lastELU); | ||
this.lastELU = newELU; | ||
|
||
// `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in | ||
// This should be corelated with CPU usage to gauge the performance characteristics of services | ||
// 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly. | ||
// Docs and examples: | ||
// - https://nodesource.com/blog/event-loop-utilization-nodejs | ||
// - https://youtu.be/WetXnEPraYM | ||
obs.observe(this.eventLoopUilization, delta.utilization); | ||
|
||
this.eventLoopTime.add(Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' }); | ||
this.eventLoopTime.add(Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' }); | ||
|
||
obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min)); | ||
obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean)); | ||
obs.observe(this.eventLoopDelayGauges.max, Math.floor(this.eventLoopDelay.max)); | ||
obs.observe(this.eventLoopDelayGauges.stddev, Math.floor(this.eventLoopDelay.stddev)); | ||
obs.observe(this.eventLoopDelayGauges.p50, Math.floor(this.eventLoopDelay.percentile(50))); | ||
obs.observe(this.eventLoopDelayGauges.p90, Math.floor(this.eventLoopDelay.percentile(90))); | ||
obs.observe(this.eventLoopDelayGauges.p99, Math.floor(this.eventLoopDelay.percentile(99))); | ||
|
||
const lag = await promise; | ||
obs.observe(Math.floor(lag)); | ||
this.eventLoopDelay.reset(); | ||
}; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.