Skip to content

Commit

Permalink
feat: track nodejs runtime metrics (#11160)
Browse files Browse the repository at this point in the history
This PR implements the OTEL nodejs runtime recommended metrics
https://opentelemetry.io/docs/specs/semconv/runtime/nodejs-metrics/
  • Loading branch information
alexghr authored Jan 14, 2025
1 parent c107b6b commit 1d24fab
Show file tree
Hide file tree
Showing 15 changed files with 244 additions and 33 deletions.
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/boot-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: boot-node
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.bootNode.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down
10 changes: 10 additions & 0 deletions spartan/aztec-network/templates/faucet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ spec:
value: "1"
- name: LOG_LEVEL
value: "{{ .Values.faucet.logLevel }}"
- name: K8S_POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: faucet
ports:
- name: http
containerPort: {{ .Values.faucet.service.nodePort }}
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-agent
- name: AZTEC_PORT
value: "{{ .Values.proverAgent.service.nodePort }}"
- name: LOG_LEVEL
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-broker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-broker
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.proverBroker.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-node
- name: POD_IP
valueFrom:
fieldRef:
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/pxe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: pxe
- name: AZTEC_PORT
value: "{{ .Values.pxe.service.nodePort }}"
- name: LOG_JSON
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/transaction-bot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: bot
- name: AZTEC_PORT
value: "{{ .Values.bot.service.nodePort }}"
- name: LOG_JSON
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: validator
- name: POD_IP
valueFrom:
fieldRef:
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/telemetry-client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"@opentelemetry/exporter-trace-otlp-http": "^0.55.0",
"@opentelemetry/host-metrics": "^0.35.4",
"@opentelemetry/otlp-exporter-base": "^0.55.0",
"@opentelemetry/resource-detector-aws": "^1.8.0",
"@opentelemetry/resource-detector-gcp": "^0.32.0",
"@opentelemetry/resources": "^1.28.0",
"@opentelemetry/sdk-logs": "^0.55.0",
"@opentelemetry/sdk-metrics": "^1.28.0",
Expand Down
2 changes: 2 additions & 0 deletions yarn-project/telemetry-client/src/attributes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@ export const REVERTIBILITY = 'aztec.revertibility';
export const GAS_DIMENSION = 'aztec.gas_dimension';

export const WORLD_STATE_REQUEST_TYPE = 'aztec.world_state_request';

export const NODEJS_EVENT_LOOP_STATE = 'nodejs.eventloop.state';
10 changes: 7 additions & 3 deletions yarn-project/telemetry-client/src/aztec_resource_detector.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import { type DetectorSync, type IResource, Resource } from '@opentelemetry/resources';
import { ATTR_K8S_POD_NAME, ATTR_K8S_POD_UID } from '@opentelemetry/semantic-conventions/incubating';
import {
ATTR_K8S_POD_NAME,
ATTR_K8S_POD_UID,
ATTR_SERVICE_INSTANCE_ID,
} from '@opentelemetry/semantic-conventions/incubating';

import { NETWORK_NAME } from './attributes.js';
import { getConfigEnvVars } from './config.js';

/**
Expand All @@ -12,9 +15,10 @@ class AztecDetector implements DetectorSync {
const config = getConfigEnvVars();

return new Resource({
[NETWORK_NAME]: config.networkName,
[ATTR_K8S_POD_UID]: config.k8sPodUid,
[ATTR_K8S_POD_NAME]: config.k8sPodName,
// this will get set by serviceInstanceIdDetector if not running in K8s
[ATTR_SERVICE_INSTANCE_ID]: config.k8sPodUid,
});
}
}
Expand Down
114 changes: 92 additions & 22 deletions yarn-project/telemetry-client/src/event_loop_monitor.ts
Original file line number Diff line number Diff line change
@@ -1,49 +1,119 @@
import { promiseWithResolvers } from '@aztec/foundation/promise';
import { Timer } from '@aztec/foundation/timer';
import { type EventLoopUtilization, type IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks';

import { EVENT_LOOP_LAG } from './metrics.js';
import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js';
import { NODEJS_EVENT_LOOP_STATE } from './attributes.js';
import * as Metrics from './metrics.js';
import {
type BatchObservableResult,
type Meter,
type ObservableGauge,
type UpDownCounter,
ValueType,
} from './telemetry.js';

/**
* Detector for custom Aztec attributes
*/
export class EventLoopMonitor {
private eventLoopLag: ObservableGauge;
private eventLoopDelayGauges: {
min: ObservableGauge;
max: ObservableGauge;
mean: ObservableGauge;
stddev: ObservableGauge;
p50: ObservableGauge;
p90: ObservableGauge;
p99: ObservableGauge;
};

private eventLoopUilization: ObservableGauge;
private eventLoopTime: UpDownCounter;

private started = false;

constructor(meter: Meter) {
this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, {
unit: 'us',
valueType: ValueType.INT,
private lastELU: EventLoopUtilization | undefined;
private eventLoopDelay: IntervalHistogram;

constructor(private meter: Meter) {
const nsObsGauge = (name: (typeof Metrics)[keyof typeof Metrics], description: string) =>
meter.createObservableGauge(name, {
unit: 'ns',
valueType: ValueType.INT,
description,
});

this.eventLoopDelayGauges = {
min: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MIN, 'Minimum delay of the event loop'),
mean: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MEAN, 'Mean delay of the event loop'),
max: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MAX, 'Max delay of the event loop'),
stddev: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_STDDEV, 'Stddev delay of the event loop'),
p50: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P50, 'P50 delay of the event loop'),
p90: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P90, 'P90 delay of the event loop'),
p99: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P99, 'P99 delay of the event loop'),
};

this.eventLoopUilization = meter.createObservableGauge(Metrics.NODEJS_EVENT_LOOP_UTILIZATION, {
valueType: ValueType.DOUBLE,
description: 'How busy is the event loop',
});

this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, {
unit: 'ms',
valueType: ValueType.INT,
description: 'How much time the event loop has spent in a given state',
});

this.eventLoopDelay = monitorEventLoopDelay();
}

start(): void {
if (this.started) {
return;
}
this.eventLoopLag.addCallback(this.measureLag);

this.lastELU = performance.eventLoopUtilization();
this.eventLoopDelay.enable();
this.meter.addBatchObservableCallback(this.measure, [
this.eventLoopUilization,
...Object.values(this.eventLoopDelayGauges),
]);
}

stop(): void {
if (!this.started) {
return;
}
this.eventLoopLag.removeCallback(this.measureLag);
this.meter.removeBatchObservableCallback(this.measure, [
this.eventLoopUilization,
...Object.values(this.eventLoopDelayGauges),
]);
this.eventLoopDelay.disable();
this.eventLoopDelay.reset();
this.lastELU = undefined;
}

private measureLag = async (obs: ObservableResult): Promise<void> => {
const timer = new Timer();
const { promise, resolve } = promiseWithResolvers<number>();
// how long does it take to schedule the next macro task?
// if this number spikes then we're (1) either blocking the event loop with long running sync code
// or (2) spamming the event loop with micro tasks
setImmediate(() => {
resolve(timer.us());
});
private measure = (obs: BatchObservableResult): void => {
const newELU = performance.eventLoopUtilization();
const delta = performance.eventLoopUtilization(newELU, this.lastELU);
this.lastELU = newELU;

// `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in
// This should be corelated with CPU usage to gauge the performance characteristics of services
// 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly.
// Docs and examples:
// - https://nodesource.com/blog/event-loop-utilization-nodejs
// - https://youtu.be/WetXnEPraYM
obs.observe(this.eventLoopUilization, delta.utilization);

this.eventLoopTime.add(Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' });
this.eventLoopTime.add(Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' });

obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min));
obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean));
obs.observe(this.eventLoopDelayGauges.max, Math.floor(this.eventLoopDelay.max));
obs.observe(this.eventLoopDelayGauges.stddev, Math.floor(this.eventLoopDelay.stddev));
obs.observe(this.eventLoopDelayGauges.p50, Math.floor(this.eventLoopDelay.percentile(50)));
obs.observe(this.eventLoopDelayGauges.p90, Math.floor(this.eventLoopDelay.percentile(90)));
obs.observe(this.eventLoopDelayGauges.p99, Math.floor(this.eventLoopDelay.percentile(99)));

const lag = await promise;
obs.observe(Math.floor(lag));
this.eventLoopDelay.reset();
};
}
11 changes: 10 additions & 1 deletion yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,13 @@ export const PROOF_VERIFIER_COUNT = 'aztec.proof_verifier.count';
export const VALIDATOR_RE_EXECUTION_TIME = 'aztec.validator.re_execution_time';
export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexecution_count';

export const EVENT_LOOP_LAG = 'aztec.event_loop_lag';
export const NODEJS_EVENT_LOOP_DELAY_MIN = 'nodejs.eventloop.delay.min';
export const NODEJS_EVENT_LOOP_DELAY_MEAN = 'nodejs.eventloop.delay.mean';
export const NODEJS_EVENT_LOOP_DELAY_MAX = 'nodejs.eventloop.delay.max';
export const NODEJS_EVENT_LOOP_DELAY_STDDEV = 'nodejs.eventloop.delay.stddev';
export const NODEJS_EVENT_LOOP_DELAY_P50 = 'nodejs.eventloop.delay.p50';
export const NODEJS_EVENT_LOOP_DELAY_P90 = 'nodejs.eventloop.delay.p90';
export const NODEJS_EVENT_LOOP_DELAY_P99 = 'nodejs.eventloop.delay.p99';

export const NODEJS_EVENT_LOOP_UTILIZATION = 'nodejs.eventloop.utilization';
export const NODEJS_EVENT_LOOP_TIME = 'nodejs.eventloop.time';
10 changes: 9 additions & 1 deletion yarn-project/telemetry-client/src/otel_resource.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// import { gcpDetector } from '@opentelemetry/resource-detector-gcp';
import {
type IResource,
detectResourcesSync,
Expand All @@ -11,7 +12,14 @@ import { aztecDetector } from './aztec_resource_detector.js';

export async function getOtelResource(): Promise<IResource> {
const resource = detectResourcesSync({
detectors: [osDetectorSync, envDetectorSync, processDetectorSync, serviceInstanceIdDetectorSync, aztecDetector],
detectors: [
osDetectorSync,
envDetectorSync,
processDetectorSync,
serviceInstanceIdDetectorSync,
// gcpDetector,
aztecDetector,
],
});

if (resource.asyncAttributesPending) {
Expand Down
Loading

0 comments on commit 1d24fab

Please sign in to comment.