Skip to content

Commit

Permalink
feat: assert metrics in network tests (#10215)
Browse files Browse the repository at this point in the history
Break out alert checker, and use it from the "gating-passive" and
"smoke" spartan tests.
  • Loading branch information
just-mitch authored Nov 26, 2024
1 parent 10d3f6f commit 9380c0f
Show file tree
Hide file tree
Showing 9 changed files with 199 additions and 105 deletions.
4 changes: 1 addition & 3 deletions yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@ docker run \
-e METRICS_PORT="4318" \
-e COLLECT_METRICS="true" \
-e PULL_REQUEST="$PULL_REQUEST" \
-e CHECK_ALERTS="true" \
$env_args \
--rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG \
"$test_path" "$@" || [ "$ignore_failures" = "true" ]


echo "Running alert checker..."
docker run --network host --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG quality_of_service/alert_checker.test.ts
10 changes: 8 additions & 2 deletions yarn-project/end-to-end/scripts/network_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,15 @@ helm upgrade --install spartan "$REPO/spartan/aztec-network/" \

kubectl wait pod -l app==pxe --for=condition=Ready -n "$NAMESPACE" --timeout=10m

# Find two free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 2)
# Find 3 free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 3)

# Extract the three free ports from the list
PXE_PORT=$(echo $FREE_PORTS | awk '{print $1}')
ANVIL_PORT=$(echo $FREE_PORTS | awk '{print $2}')
METRICS_PORT=$(echo $FREE_PORTS | awk '{print $3}')

GRAFANA_PASSWORD=$(kubectl get secrets -n metrics metrics-grafana -o jsonpath='{.data.admin-password}' | base64 --decode)

# Namespace variable (assuming it's set)
NAMESPACE=${NAMESPACE:-default}
Expand All @@ -170,6 +173,9 @@ docker run --rm --network=host \
-e CONTAINER_PXE_PORT=8081 \
-e HOST_ETHEREUM_PORT=$ANVIL_PORT \
-e CONTAINER_ETHEREUM_PORT=8545 \
-e HOST_METRICS_PORT=$METRICS_PORT \
-e CONTAINER_METRICS_PORT=80 \
-e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \
-e DEBUG="aztec:*" \
-e LOG_JSON=1 \
-e LOG_LEVEL=debug \
Expand Down
20 changes: 20 additions & 0 deletions yarn-project/end-to-end/src/e2e_p2p/gossip_network.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,29 @@ import fs from 'fs';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { type NodeContext, createNodes } from '../fixtures/setup_p2p_test.js';
import { AlertChecker, type AlertConfig } from '../quality_of_service/alert_checker.js';
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
import { createPXEServiceAndSubmitTransactions } from './shared.js';

const CHECK_ALERTS = process.env.CHECK_ALERTS === 'true';

// Don't set this to a higher value than 9 because each node will use a different L1 publisher account and anvil seeds
const NUM_NODES = 4;
const NUM_TXS_PER_NODE = 2;
const BOOT_NODE_UDP_PORT = 40600;

const DATA_DIR = './data/gossip';

// Quality-of-service alerts for this test. They are handed to AlertChecker.runAlertCheck in the
// afterAll hook, and only when the CHECK_ALERTS environment variable is 'true'.
// The checker runs `expr` as a single query and fails the test if the first returned sample is > 0.
// NOTE(review): the 2500 threshold is presumably in milliseconds — confirm against the metric definition.
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
expr: 'aztec_sequencer_time_to_collect_attestations > 2500',
labels: { severity: 'error' },
// `for`, `labels` and `annotations` are required by AlertConfig; the checker only reads `alert` and `expr`.
for: '10m',
annotations: {},
},
];

describe('e2e_p2p_network', () => {
let t: P2PNetworkTest;
let nodes: AztecNodeService[];
Expand All @@ -39,6 +52,13 @@ describe('e2e_p2p_network', () => {
}
});

afterAll(async () => {
if (CHECK_ALERTS) {
const checker = new AlertChecker(t.logger);
await checker.runAlertCheck(qosAlerts);
}
});

it('should rollup txs from all peers', async () => {
// create the bootstrap node for the network
if (!t.bootstrapNodeEnr) {
Expand Down

This file was deleted.

105 changes: 105 additions & 0 deletions yarn-project/end-to-end/src/quality_of_service/alert_checker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { type DebugLogger } from '@aztec/aztec.js';

import * as fs from 'fs';
import * as yaml from 'js-yaml';

/**
 * A single alert definition checked by AlertChecker.
 *
 * The same shape is read from the `alerts` list of a YAML file by
 * `AlertChecker.runAlertCheckFromFilePath`.
 * NOTE(review): the field names look like those of a Prometheus alerting rule — confirm that
 * compatibility is intended.
 */
export interface AlertConfig {
// Name of the alert; used in log messages when the alert passes or triggers.
alert: string;
// Query expression sent to Grafana; the alert is considered triggered when the returned value is > 0.
expr: string;
// Not read by AlertChecker (it issues a single query per alert); presumably a hold duration such as '10m'.
for: string;
// Not interpreted by AlertChecker; only included when the whole alert is logged.
labels: Record<string, string>;
// Not interpreted by AlertChecker; only included when the whole alert is logged.
annotations: Record<string, string>;
}

/** Connection settings AlertChecker uses to run queries through Grafana. */
export interface AlertCheckerConfig {
// Full URL of the query endpoint; `?query=<url-encoded expr>` is appended to it for each alert.
grafanaEndpoint: string;
// Credentials that are base64-encoded into an HTTP Basic Authorization header, e.g. 'admin:admin'.
grafanaCredentials: string;
}

// This config is good if you're running the otel-lgtm stack locally.
// It is the base configuration of every AlertChecker: values passed to the constructor are
// spread over it, so callers only need to supply the fields they want to override.
const DEFAULT_CONFIG: AlertCheckerConfig = {
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query',
grafanaCredentials: 'admin:admin',
};

/**
 * Runs alert expressions against a Prometheus datasource through Grafana's proxy endpoint and
 * fails (by throwing) when any of them reports a value greater than zero.
 */
export class AlertChecker {
  private config: AlertCheckerConfig;
  private logger: DebugLogger;

  /**
   * @param logger - Logger used to report the outcome of every alert.
   * @param config - Overrides for the default (local otel-lgtm) Grafana endpoint and credentials.
   */
  constructor(logger: DebugLogger, config: Partial<AlertCheckerConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.logger = logger;
  }

  /**
   * Load the alerts config from a file path.
   * @param filePath - The absolute path to the alerts file.
   * @returns The entries of the file's top-level `alerts` list.
   * @throws If the file cannot be read/parsed or has no top-level `alerts` list.
   */
  private loadAlertsConfig(filePath: string): AlertConfig[] {
    const fileContents = fs.readFileSync(filePath, 'utf8');
    // NOTE(review): `yaml.load` is only safe on untrusted input with js-yaml >= 4; confirm the
    // pinned version if this is ever pointed at files that are not part of the repository.
    const data: unknown = yaml.load(fileContents);
    const alerts = (data as { alerts?: unknown } | null | undefined)?.alerts;
    if (!Array.isArray(alerts)) {
      // Without this guard a malformed file surfaces later as "alerts is not iterable".
      throw new Error(`Alerts file ${filePath} must contain a top-level "alerts" list`);
    }
    return alerts as AlertConfig[];
  }

  /**
   * Convert a Prometheus sample value (transferred as a string) into a number.
   * Prometheus spells infinities as "+Inf"/"-Inf", which `parseFloat` would turn into NaN.
   */
  private static parseSampleValue(raw: unknown): number {
    if (raw === '+Inf') {
      return Number.POSITIVE_INFINITY;
    }
    if (raw === '-Inf') {
      return Number.NEGATIVE_INFINITY;
    }
    return parseFloat(String(raw));
  }

  /**
   * Run an instant query and reduce the answer to a single number.
   * @param expr - The query expression to evaluate.
   * @returns The largest value among the returned series, or 0 when no series is returned.
   * @throws If the request fails or the response is not an instant-vector result.
   */
  private async queryGrafana(expr: string): Promise<number> {
    const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64');

    const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, {
      headers: {
        Authorization: `Basic ${credentials}`,
      },
    });

    if (!response.ok) {
      // statusText is frequently empty, so the numeric status is included as well.
      throw new Error(`Failed to fetch data from Grafana: ${response.status} ${response.statusText}`);
    }

    const body = (await response.json()) as { data?: { result?: unknown } } | null;
    const result = body?.data?.result;
    if (!Array.isArray(result)) {
      throw new Error(`Unexpected response from Grafana for query "${expr}": ${JSON.stringify(body)}`);
    }

    // Look at every returned series, not just the first one: the expression may match several
    // label sets and any of them can be the one that is out of bounds.
    let highest = Number.NaN;
    for (const sample of result) {
      const value = (sample as { value?: unknown } | null | undefined)?.value;
      if (!Array.isArray(value) || value.length < 2) {
        throw new Error(`Unexpected sample from Grafana for query "${expr}": ${JSON.stringify(sample)}`);
      }
      const parsed = AlertChecker.parseSampleValue(value[1]);
      if (Number.isNaN(highest) || parsed > highest) {
        highest = parsed;
      }
    }
    return result.length > 0 ? highest : 0;
  }

  /**
   * Evaluate every alert and throw once all of them have been checked if at least one triggered.
   * @param alerts - The alerts to check.
   * @throws If any alert's query yields a value greater than zero, or a query fails.
   */
  private async checkAlerts(alerts: AlertConfig[]): Promise<void> {
    let alertTriggered = false;

    for (const alert of alerts) {
      this.logger.info(`Checking alert: ${JSON.stringify(alert)}`);

      const metricValue = await this.queryGrafana(alert.expr);
      this.logger.info(`Metric value: ${metricValue}`);
      if (metricValue > 0) {
        this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);
        alertTriggered = true;
      } else {
        this.logger.info(`Alert ${alert.alert} passed.`);
      }
    }

    if (alertTriggered) {
      throw new Error('Test failed due to triggered alert');
    }
  }

  /**
   * Run the alert check based on the alerts defined in an array.
   * @param alerts - The alerts to check.
   * @throws If any alert triggered or could not be evaluated; the error is logged before rethrowing.
   */
  public async runAlertCheck(alerts: AlertConfig[]): Promise<void> {
    try {
      await this.checkAlerts(alerts);
      this.logger.info('All alerts passed.');
    } catch (error) {
      this.logger.error(error instanceof Error ? error.message : String(error));
      throw error;
    }
  }

  /**
   * Run the alert check based on the alerts defined in a yaml file.
   * @param filePath - The absolute path to the alerts file.
   * @throws If the file is invalid, or any alert triggered or could not be evaluated.
   */
  public async runAlertCheckFromFilePath(filePath: string): Promise<void> {
    const alerts = this.loadAlertsConfig(filePath);
    // Go through runAlertCheck so both entry points log success and failure the same way.
    await this.runAlertCheck(alerts);
  }
}
10 changes: 0 additions & 10 deletions yarn-project/end-to-end/src/quality_of_service/alerts.yaml

This file was deleted.

23 changes: 23 additions & 0 deletions yarn-project/end-to-end/src/spartan/gating-passive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createDebugLogger } from '@aztec/foundation/log';
import { expect, jest } from '@jest/globals';

import { RollupCheatCodes } from '../../../aztec.js/src/utils/cheat_codes.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import {
applyBootNodeFailure,
applyNetworkShaping,
Expand All @@ -13,9 +14,20 @@ import {
getConfig,
isK8sConfig,
restartBot,
runAlertCheck,
startPortForward,
} from './utils.js';

// Quality-of-service alerts for this test; passed to runAlertCheck (from ./utils.js) in the afterAll hook.
// The expression averages the metric over a 2-minute window before comparing it with the threshold.
// NOTE(review): the 2500 threshold is presumably in milliseconds — confirm against the metric definition.
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
for: '10m',
annotations: {},
},
];

const config = getConfig(process.env);
if (!isK8sConfig(config)) {
throw new Error('This test must be run in a k8s environment');
Expand All @@ -39,6 +51,10 @@ describe('a test that passively observes the network in the presence of network
// 50% is the max that we expect to miss
const MAX_MISSED_SLOT_PERCENT = 0.5;

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('survives network chaos', async () => {
await startPortForward({
resource: `svc/${config.INSTANCE_NAME}-aztec-network-pxe`,
Expand All @@ -52,6 +68,13 @@ describe('a test that passively observes the network in the presence of network
containerPort: CONTAINER_ETHEREUM_PORT,
hostPort: HOST_ETHEREUM_PORT,
});

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
const client = await createCompatibleClient(PXE_URL, debugLogger);
const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST);
const rollupCheatCodes = new RollupCheatCodes(
Expand Down
27 changes: 25 additions & 2 deletions yarn-project/end-to-end/src/spartan/smoke.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,23 @@ import { RollupAbi } from '@aztec/l1-artifacts';
import { createPublicClient, getAddress, getContract, http } from 'viem';
import { foundry } from 'viem/chains';

import { getConfig, isK8sConfig, startPortForward } from './utils.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import { getConfig, isK8sConfig, runAlertCheck, startPortForward } from './utils.js';

const config = getConfig(process.env);

const debugLogger = createDebugLogger('aztec:spartan-test:smoke');
// const userLog = createConsoleLogger();

// QoS alerts for when we are running in k8s.
// They are passed to runAlertCheck (from ./utils.js) in the afterAll hook.
// The expression averages the metric over a 2-minute window before comparing it with the threshold.
// NOTE(review): the 2500 threshold is presumably in milliseconds — confirm against the metric definition.
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
for: '10m',
annotations: {},
},
];

describe('smoke test', () => {
let pxe: PXE;
Expand All @@ -24,11 +35,23 @@ describe('smoke test', () => {
hostPort: config.HOST_PXE_PORT,
});
PXE_URL = `http://127.0.0.1:${config.HOST_PXE_PORT}`;

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
} else {
PXE_URL = config.PXE_URL;
}
pxe = await createCompatibleClient(PXE_URL, debugLogger);
});

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('should be able to get node enr', async () => {
const info = await pxe.getNodeInfo();
expect(info).toBeDefined();
Expand Down
Loading

0 comments on commit 9380c0f

Please sign in to comment.