feat: Prover e2e test (#2975)
## What ❔

- Add a workflow that runs proving for a genesis batch (a rough sketch of the flow is shown below).
- Update Dockerfiles and docker compose files for GPU runners.
- Add the circuit prover to the zkstack CLI.
- Fix the HTTP URL for the prover gateway.
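
For orientation, here is a rough sketch of the sequence the new workflow drives, condensed from `.github/workflows/ci-prover-e2e.yml` added below. It assumes the repo is checked out, `zkstack` is installed, and the GPU runner services from the docker compose files are already up; the exact flags live in the workflow itself.

```bash
# Sketch only, condensed from the workflow steps; flags omitted here are listed in the workflow.
zkstack chain create --chain-name proving_chain --prover-mode gpu --ignore-prerequisites  # plus the remaining chain flags
zkstack ecosystem init --dev
zkstack prover init --dev
zkstack prover init-bellman-cuda --clone
zkstack prover setup-keys --mode=download --region=us

# Run the server and the prover components in the background, then wait for batch 1 to be
# proven, compressed, and executed on L1 using the checkers added in bin/prover_checkers/.
zkstack server --chain=proving_chain --components=api,tree,eth,state_keeper,commitment_generator,proof_data_handler,vm_runner_protective_reads,vm_runner_bwip &
zkstack prover run --component=gateway --docker=false &
zkstack prover run --component=prover-job-monitor --docker=false &
zkstack prover run --component=witness-generator --round=all-rounds --docker=false &
zkstack prover run --component=circuit-prover --witness-vector-generator-count=10 --docker=false &
# ...once batch 1 is proven, kill the circuit prover and start the compressor:
zkstack prover run --component=compressor --docker=false &
```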

## Why ❔

To detect possible runtime issues.

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [ ] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [ ] Code has been formatted via `zk fmt` and `zk lint`.
Artemka374 authored Oct 15, 2024
1 parent 331fe87 commit 2245b35
Showing 19 changed files with 488 additions and 54 deletions.
127 changes: 127 additions & 0 deletions .github/workflows/ci-prover-e2e.yml
@@ -0,0 +1,127 @@
name: Workflow for testing prover component end-to-end
on:
workflow_call:

jobs:
e2e-test:
runs-on: [ matterlabs-ci-gpu-l4-runner-prover-tests ]
env:
RUNNER_COMPOSE_FILE: "docker-compose-gpu-runner-cuda-12-0.yml"

steps:
- uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4
with:
submodules: "recursive"
fetch-depth: 0

- name: Setup environment
run: |
echo ZKSYNC_HOME=$(pwd) >> $GITHUB_ENV
echo $(pwd)/bin >> $GITHUB_PATH
echo IN_DOCKER=1 >> .env
echo "SCCACHE_GCS_BUCKET=matterlabs-infra-sccache-storage" >> .env
echo "SCCACHE_GCS_SERVICE_ACCOUNT=gha-ci-runners@matterlabs-infra.iam.gserviceaccount.com" >> .env
echo "SCCACHE_GCS_RW_MODE=READ_WRITE" >> .env
echo "RUSTC_WRAPPER=sccache" >> .env
mkdir -p prover_logs
- name: Start services
run: |
run_retried docker-compose -f ${RUNNER_COMPOSE_FILE} pull
mkdir -p ./volumes/postgres ./volumes/reth/data
docker-compose -f ${RUNNER_COMPOSE_FILE} --profile runner up -d --wait
ci_run sccache --start-server
- name: Init
run: |
ci_run git config --global --add safe.directory "*"
ci_run chmod -R +x ./bin
ci_run ./zkstack_cli/zkstackup/install -g --path ./zkstack_cli/zkstackup/zkstackup || true
ci_run zkstackup -g --local
ci_run zkstack chain create \
--chain-name proving_chain \
--chain-id sequential \
--prover-mode gpu \
--wallet-creation localhost \
--l1-batch-commit-data-generator-mode rollup \
--base-token-address 0x0000000000000000000000000000000000000001 \
--base-token-price-nominator 1 \
--base-token-price-denominator 1 \
--set-as-default true \
--ignore-prerequisites
ci_run zkstack ecosystem init --dev --verbose
ci_run zkstack prover init --dev --verbose
echo "URL=$(grep "http_url" ./chains/proving_chain/configs/general.yaml | awk '{ print $2 }')" >> $GITHUB_ENV
- name: Build prover binaries
run: |
ci_run cargo build --release --workspace --manifest-path=prover/Cargo.toml
- name: Prepare prover subsystem
run: |
ci_run zkstack prover init-bellman-cuda --clone --verbose
ci_run zkstack prover setup-keys --mode=download --region=us --verbose
- name: Run server
run: |
ci_run zkstack server --uring --chain=proving_chain --components=api,tree,eth,state_keeper,commitment_generator,proof_data_handler,vm_runner_protective_reads,vm_runner_bwip &>prover_logs/server.log &
- name: Run Gateway
run: |
ci_run zkstack prover run --component=gateway --docker=false &>prover_logs/gateway.log &
- name: Run Prover Job Monitor
run: |
ci_run zkstack prover run --component=prover-job-monitor --docker=false &>prover_logs/prover-job-monitor.log &
- name: Wait for batch to be passed through gateway
env:
DATABASE_URL: postgres://postgres:notsecurepassword@localhost:5432/zksync_prover_localhost_proving_chain
BATCH_NUMBER: 1
INTERVAL: 30
TIMEOUT: 300
run: |
PASSED_ENV_VARS="DATABASE_URL,BATCH_NUMBER,INTERVAL,TIMEOUT" \
ci_run ./bin/prover_checkers/batch_availability_checker
- name: Run Witness Generator
run: |
ci_run zkstack prover run --component=witness-generator --round=all-rounds --docker=false &>prover_logs/witness-generator.log &
- name: Run Circuit Prover
run: |
ci_run zkstack prover run --component=circuit-prover --witness-vector-generator-count=10 --docker=false &>prover_logs/circuit_prover.log &
- name: Wait for prover jobs to finish
env:
DATABASE_URL: postgres://postgres:notsecurepassword@localhost:5432/zksync_prover_localhost_proving_chain
BATCH_NUMBER: 1
INTERVAL: 30
TIMEOUT: 1200
run: |
PASSED_ENV_VARS="DATABASE_URL,BATCH_NUMBER,INTERVAL,TIMEOUT" \
ci_run ./bin/prover_checkers/prover_jobs_status_checker
- name: Kill prover & start compressor
run: |
sudo ./bin/prover_checkers/kill_prover
ci_run zkstack prover run --component=compressor --docker=false &>prover_logs/compressor.log &
- name: Wait for batch to be executed on L1
env:
DATABASE_URL: postgres://postgres:notsecurepassword@localhost:5432/zksync_prover_localhost_proving_chain
BATCH_NUMBER: 1
INTERVAL: 30
TIMEOUT: 600
run: |
PASSED_ENV_VARS="BATCH_NUMBER,DATABASE_URL,URL,INTERVAL,TIMEOUT" \
ci_run ./bin/prover_checkers/batch_l1_status_checker
- name: Upload logs
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
if: always()
with:
name: prover_logs
path: prover_logs

- name: Show sccache logs
if: always()
run: |
ci_run sccache --show-stats || true
ci_run cat /tmp/sccache_log.txt || true
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
@@ -94,6 +94,12 @@ jobs:
name: CI for Prover Components
uses: ./.github/workflows/ci-prover-reusable.yml

e2e-for-prover:
name: E2E Test for Prover Components
needs: changed_files
if: ${{(needs.changed_files.outputs.prover == 'true' || needs.changed_files.outputs.all == 'true') && !contains(github.ref_name, 'release-please--branches') }}
uses: ./.github/workflows/ci-prover-e2e.yml

ci-for-docs:
needs: changed_files
if: needs.changed_files.outputs.docs == 'true'
6 changes: 3 additions & 3 deletions .github/workflows/zk-environment-publish.yml
@@ -49,10 +49,10 @@ jobs:
- docker/zk-environment/Dockerfile
- .github/workflows/zk-environment-publish.yml
zk_env_cuda_11_8:
- docker/zk-environment/20.04_amd64_cuda_11_8.Dockerfile
- docker/zk-environment/22.04_amd64_cuda_11_8.Dockerfile
- .github/workflows/zk-environment-publish.yml
zk_env_cuda_12:
- docker/zk-environment/20.04_amd64_cuda_12_0.Dockerfile
- docker/zk-environment/22.04_amd64_cuda_12_0.Dockerfile
- .github/workflows/zk-environment-publish.yml
get_short_sha:
@@ -245,7 +245,7 @@ jobs:
if: ${{ (steps.condition.outputs.should_run == 'true') || (github.event_name == 'workflow_dispatch' && inputs.build_cuda) }}
uses: docker/build-push-action@5176d81f87c23d6fc96624dfdbcd9f3830bbe445 # v6.5.0
with:
file: docker/zk-environment/20.04_amd64_cuda_${{ matrix.cuda_version }}.Dockerfile
file: docker/zk-environment/22.04_amd64_cuda_${{ matrix.cuda_version }}.Dockerfile
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || (github.event_name == 'workflow_dispatch' && inputs.build_cuda) }}
tags: |
us-docker.pkg.dev/matterlabs-infra/matterlabs-docker/zk-environment-cuda-${{ matrix.cuda_version }}:latest
40 changes: 40 additions & 0 deletions bin/prover_checkers/batch_availability_checker
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

set -o errexit
set -o pipefail

# Configuration
# DATABASE_URL - The URL of the prover database to connect to
# BATCH_NUMBER - The batch number to check availability for
# INTERVAL - Time interval for polling in seconds
# TIMEOUT - Timeout of script in seconds

# Start timer
START_TIME=$(date +%s)

# Loop to query periodically
while true; do
# Calculate the elapsed time
CURRENT_TIME=$(date +%s)
ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

# Check if the timeout has been reached
if [ $ELAPSED_TIME -ge $TIMEOUT ]; then
echo "Timeout reached. Failing CI..."
exit 1 # Exit with non-zero status to fail CI
fi

# Run the SQL query and capture the result
RESULT=$(psql $DATABASE_URL -c "SELECT count(*) FROM witness_inputs_fri WHERE l1_batch_number = $BATCH_NUMBER;" -t -A)

# Check if the result is 1
if [ "$RESULT" -eq 1 ]; then
echo "Query result is 1. Success!"
exit 0 # Exit with zero status to succeed CI
else
echo "Batch is not available yet. Retrying in $INTERVAL seconds..."
fi

# Wait for the next interval
sleep $INTERVAL
done
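
For reference, the workflow above invokes this checker through `ci_run`, forwarding its configuration via `PASSED_ENV_VARS`. Run standalone, the invocation would look roughly like this (values taken from the workflow; `psql` must be installed and the prover database reachable):

```bash
# Rough standalone invocation; in CI the same variables are forwarded through
# PASSED_ENV_VARS and the script runs via ci_run inside the runner container.
DATABASE_URL=postgres://postgres:notsecurepassword@localhost:5432/zksync_prover_localhost_proving_chain \
BATCH_NUMBER=1 INTERVAL=30 TIMEOUT=300 \
./bin/prover_checkers/batch_availability_checker
```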
54 changes: 54 additions & 0 deletions bin/prover_checkers/batch_l1_status_checker
@@ -0,0 +1,54 @@
#!/usr/bin/env bash

set -o errexit
set -o pipefail

# Needs the following configuration:
# URL - URL of the API endpoint
# INTERVAL - Time interval for polling in seconds
# TIMEOUT - Timeout of script in seconds

# Start timer
START_TIME=$(date +%s)

echo "URL: $URL"

# Loop to query periodically
while true; do
# Calculate the elapsed time
CURRENT_TIME=$(date +%s)
ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

# Check if the timeout has been reached
if [ $ELAPSED_TIME -ge $TIMEOUT ]; then
echo "Timeout reached. Failing CI..."
exit 1 # Exit with non-zero status to fail CI
fi

# Run the curl request and capture the response
RESPONSE=$(curl --silent --request POST \
--url $URL \
--header 'Content-Type: application/json' \
--data '{
"jsonrpc": "2.0",
"id": 1,
"method": "zks_getBlockDetails",
"params": [1]
}')

# Parse the executedAt field using jq
EXECUTED_AT=$(echo $RESPONSE | jq -r '.result.executedAt')

# Check if executedAt is not null
if [ "$EXECUTED_AT" != "null" ] && [ -n "$EXECUTED_AT" ]; then
echo "executedAt is not null: $EXECUTED_AT"
echo "true"
exit 0 # Exit with zero status to succeed CI
else
DATABASE_STATUS=$(psql $DATABASE_URL -c "SELECT status FROM proof_compression_jobs_fri WHERE l1_batch_number = $BATCH_NUMBER;" -t -A)
echo "executedAt is null, database status is $DATABASE_STATUS, retrying in $INTERVAL seconds..."
fi

# Wait for the next interval
sleep $INTERVAL
done
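
This checker additionally needs `jq` and the chain's HTTP RPC endpoint; in the workflow, `URL` is read from the `http_url` entry in `chains/proving_chain/configs/general.yaml`. A rough standalone invocation (the URL below is a placeholder for whatever that file contains):

```bash
# Rough standalone invocation; URL is a placeholder, the workflow extracts the real
# http_url from chains/proving_chain/configs/general.yaml.
URL=http://127.0.0.1:3050 \
DATABASE_URL=postgres://postgres:notsecurepassword@localhost:5432/zksync_prover_localhost_proving_chain \
BATCH_NUMBER=1 INTERVAL=30 TIMEOUT=600 \
./bin/prover_checkers/batch_l1_status_checker
```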
12 changes: 12 additions & 0 deletions bin/prover_checkers/kill_prover
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -o errexit
set -o pipefail

# Use pkill to find and kill processes using circuit prover
if ! pkill -f 'zksync_circuit_prover|zkstack prover run --component=circuit-prover'; then
echo "No processes are currently using the GPU."
exit 0
fi

echo "All GPU-related processes have been killed."
42 changes: 42 additions & 0 deletions bin/prover_checkers/prover_jobs_status_checker
@@ -0,0 +1,42 @@
#!/usr/bin/env bash

set -o errexit
set -o pipefail

# Configuration
# DATABASE_URL - The URL of the prover database to connect to
# BATCH_NUMBER - The batch number to check readiness for
# INTERVAL - Time interval for polling in seconds
# TIMEOUT - Timeout of script in seconds

# Start timer
START_TIME=$(date +%s)

# Loop to query periodically
while true; do
# Calculate the elapsed time
CURRENT_TIME=$(date +%s)
ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

# Check if the timeout has been reached
if [ $ELAPSED_TIME -ge $TIMEOUT ]; then
echo "Timeout reached. Failing CI..."
exit 1 # Exit with non-zero status to fail CI
fi

# Run the SQL query and capture the result
RESULT=$(psql $DATABASE_URL -c "SELECT count(*) FROM proof_compression_jobs_fri WHERE l1_batch_number = $BATCH_NUMBER AND status = 'queued';" -t -A)

# Check if the result is 1
if [ "$RESULT" -eq 1 ]; then
echo "Query result is 1. Success!"
exit 0 # Exit with zero status to succeed CI
else
STATUS=$(psql $DATABASE_URL -c "SELECT COUNT(*), status FROM prover_jobs_fri WHERE l1_batch_number = $BATCH_NUMBER GROUP BY status;" -t -A)
echo "Current status is $STATUS"
echo "Retrying in $INTERVAL seconds..."
fi

# Wait for the next interval
sleep $INTERVAL
done
2 changes: 1 addition & 1 deletion core/node/proof_data_handler/src/lib.rs
@@ -30,7 +30,7 @@ pub async fn run_server(
mut stop_receiver: watch::Receiver<bool>,
) -> anyhow::Result<()> {
let bind_address = SocketAddr::from(([0, 0, 0, 0], config.http_port));
tracing::debug!("Starting proof data handler server on {bind_address}");
tracing::info!("Starting proof data handler server on {bind_address}");
let app = create_proof_processing_router(blob_store, connection_pool, config, commitment_mode);

let listener = tokio::net::TcpListener::bind(bind_address)
13 changes: 10 additions & 3 deletions docker-compose-gpu-runner-cuda-12-0.yml
@@ -3,6 +3,8 @@ services:
reth:
restart: always
image: "ghcr.io/paradigmxyz/reth:v1.0.6"
ports:
- 127.0.0.1:8545:8545
volumes:
- type: bind
source: ./volumes/reth/data
@@ -12,11 +14,9 @@
target: /chaindata

command: node --dev --datadir /rethdata --http --http.addr 0.0.0.0 --http.port 8545 --http.corsdomain "*" --dev.block-time 300ms --chain /chaindata/reth_config
ports:
- 127.0.0.1:8545:8545

zk:
image: ghcr.io/matter-labs/zk-environment:cuda-12-0-latest
image: ghcr.io/matter-labs/zk-environment:cuda-12_0-latest
depends_on:
- reth
- postgres
@@ -49,11 +49,18 @@
- /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools
env_file:
- ./.env
extra_hosts:
- "host:host-gateway"
profiles:
- runner
network_mode: host
pid: host
deploy:
resources:
reservations:
devices:
- capabilities: [ gpu ]

postgres:
image: "postgres:14"
command: postgres -c 'max_connections=200'
7 changes: 6 additions & 1 deletion docker-compose-gpu-runner.yml
@@ -16,7 +16,7 @@ services:
- 127.0.0.1:8545:8545

zk:
image: "ghcr.io/matter-labs/zk-environment:cuda-11-8-latest"
image: "ghcr.io/matter-labs/zk-environment:cuda-11_8-latest"
container_name: zk
depends_on:
- reth
@@ -40,6 +40,11 @@
- GITHUB_WORKSPACE=$GITHUB_WORKSPACE
env_file:
- ./.env
extra_hosts:
- "host:host-gateway"
profiles:
- runner
network_mode: host
deploy:
resources:
reservations:
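
Both GPU compose files now put the `zk` service behind the new `runner` profile; the e2e workflow brings the CUDA 12 variant up roughly as in its "Start services" step (a sketch, run from the repo root):

```bash
# Pull images, create the bind-mount directories, then start the runner profile and
# wait for the services to come up (the workflow wraps the pull in its run_retried helper).
docker-compose -f docker-compose-gpu-runner-cuda-12-0.yml pull
mkdir -p ./volumes/postgres ./volumes/reth/data
docker-compose -f docker-compose-gpu-runner-cuda-12-0.yml --profile runner up -d --wait
```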