From 1b61d0797062ab8b0aa2c1e92b23a3a0d8fd2c61 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Fri, 26 Jul 2024 08:24:58 +0400 Subject: [PATCH] feat: New prover documentation (#2466) New prover documentation. [Rendered form](https://github.com/matter-labs/zksync-era/tree/popzxc-new-prover-docs/prover/docs/README.md) is recommended, as it's easier to view. Additionally, does the following: - Introduces ordering for advanced guides (it was a mix, now it's more or less sorted and suggests an order for reading). - Fixes a few things in the `setup-dev.md`. - Creates a `prover-local` config profile, so that no manual interactions with configs are needed. - Gets rid of `prover/setup.sh` as it's no longer needed that much. --- .gitignore | 1 + .../{contracts.md => 04_contracts.md} | 0 ...how_call_works.md => 05_how_call_works.md} | 0 ...n_works.md => 06_how_transaction_works.md} | 0 .../{fee_model.md => 07_fee_model.md} | 0 ..._works.md => 08_how_l2_messaging_works.md} | 0 .../advanced/{pubdata.md => 09_pubdata.md} | 0 ...with-blobs.md => 10_pubdata_with_blobs.md} | 0 .../{compression.md => 11_compression.md} | 0 ...vm_intro.md => 12_alternative_vm_intro.md} | 2 - .../{zk_intuition.md => 13_zk_intuition.md} | 0 ...r_overview.md => 14_zk_deeper_overview.md} | 4 +- .../{prover_keys.md => 15_prover_keys.md} | 0 ..._debugging.md => 90_advanced_debugging.md} | 0 .../{docker_and_ci.md => 91_docker_and_ci.md} | 0 docs/guides/advanced/README.md | 28 ++ docs/guides/setup-dev.md | 109 ++---- etc/env/base/fri_prover.toml | 2 +- etc/env/configs/prover-local.toml | 8 + prover/README.md | 41 +- prover/crates/bin/prover_fri/README.md | 351 +----------------- prover/docs/00_intro.md | 80 ++++ prover/docs/01_gcp_vm.md | 147 ++++++++ prover/docs/02_setup.md | 58 +++ prover/docs/03_launch.md | 101 +++++ prover/docs/04_flow.md | 238 ++++++++++++ prover/docs/99_further_reading.md | 13 + prover/docs/README.md | 16 + prover/setup.sh | 26 -- 29 files changed, 742 insertions(+), 483 
deletions(-) rename docs/guides/advanced/{contracts.md => 04_contracts.md} (100%) rename docs/guides/advanced/{how_call_works.md => 05_how_call_works.md} (100%) rename docs/guides/advanced/{how_transaction_works.md => 06_how_transaction_works.md} (100%) rename docs/guides/advanced/{fee_model.md => 07_fee_model.md} (100%) rename docs/guides/advanced/{how_l2_messaging_works.md => 08_how_l2_messaging_works.md} (100%) rename docs/guides/advanced/{pubdata.md => 09_pubdata.md} (100%) rename docs/guides/advanced/{pubdata-with-blobs.md => 10_pubdata_with_blobs.md} (100%) rename docs/guides/advanced/{compression.md => 11_compression.md} (100%) rename docs/guides/advanced/{0_alternative_vm_intro.md => 12_alternative_vm_intro.md} (99%) rename docs/guides/advanced/{zk_intuition.md => 13_zk_intuition.md} (100%) rename docs/guides/advanced/{deeper_overview.md => 14_zk_deeper_overview.md} (99%) rename docs/guides/advanced/{prover_keys.md => 15_prover_keys.md} (100%) rename docs/guides/advanced/{advanced_debugging.md => 90_advanced_debugging.md} (100%) rename docs/guides/advanced/{docker_and_ci.md => 91_docker_and_ci.md} (100%) create mode 100644 docs/guides/advanced/README.md create mode 100644 etc/env/configs/prover-local.toml create mode 100644 prover/docs/00_intro.md create mode 100644 prover/docs/01_gcp_vm.md create mode 100644 prover/docs/02_setup.md create mode 100644 prover/docs/03_launch.md create mode 100644 prover/docs/04_flow.md create mode 100644 prover/docs/99_further_reading.md create mode 100644 prover/docs/README.md delete mode 100755 prover/setup.sh diff --git a/.gitignore b/.gitignore index 3ffddc7a7930..7b626c310d4b 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ Cargo.lock !/etc/env/configs/stage-proofs.toml !/etc/env/configs/testnet.toml !/etc/env/configs/mainnet.toml +!/etc/env/configs/prover-local.toml /etc/env/l1-inits !/etc/env/l1-inits/stage.env !/etc/env/l1-inits/stage_proofs.env diff --git a/docs/guides/advanced/contracts.md 
b/docs/guides/advanced/04_contracts.md similarity index 100% rename from docs/guides/advanced/contracts.md rename to docs/guides/advanced/04_contracts.md diff --git a/docs/guides/advanced/how_call_works.md b/docs/guides/advanced/05_how_call_works.md similarity index 100% rename from docs/guides/advanced/how_call_works.md rename to docs/guides/advanced/05_how_call_works.md diff --git a/docs/guides/advanced/how_transaction_works.md b/docs/guides/advanced/06_how_transaction_works.md similarity index 100% rename from docs/guides/advanced/how_transaction_works.md rename to docs/guides/advanced/06_how_transaction_works.md diff --git a/docs/guides/advanced/fee_model.md b/docs/guides/advanced/07_fee_model.md similarity index 100% rename from docs/guides/advanced/fee_model.md rename to docs/guides/advanced/07_fee_model.md diff --git a/docs/guides/advanced/how_l2_messaging_works.md b/docs/guides/advanced/08_how_l2_messaging_works.md similarity index 100% rename from docs/guides/advanced/how_l2_messaging_works.md rename to docs/guides/advanced/08_how_l2_messaging_works.md diff --git a/docs/guides/advanced/pubdata.md b/docs/guides/advanced/09_pubdata.md similarity index 100% rename from docs/guides/advanced/pubdata.md rename to docs/guides/advanced/09_pubdata.md diff --git a/docs/guides/advanced/pubdata-with-blobs.md b/docs/guides/advanced/10_pubdata_with_blobs.md similarity index 100% rename from docs/guides/advanced/pubdata-with-blobs.md rename to docs/guides/advanced/10_pubdata_with_blobs.md diff --git a/docs/guides/advanced/compression.md b/docs/guides/advanced/11_compression.md similarity index 100% rename from docs/guides/advanced/compression.md rename to docs/guides/advanced/11_compression.md diff --git a/docs/guides/advanced/0_alternative_vm_intro.md b/docs/guides/advanced/12_alternative_vm_intro.md similarity index 99% rename from docs/guides/advanced/0_alternative_vm_intro.md rename to docs/guides/advanced/12_alternative_vm_intro.md index fab623e38ae3..a36f0b560d33 
100644 --- a/docs/guides/advanced/0_alternative_vm_intro.md +++ b/docs/guides/advanced/12_alternative_vm_intro.md @@ -2,8 +2,6 @@ ## zkEVM clarifier -[Back to ToC](../../specs/README.md) - The ZKsync zkEVM plays a fundamentally different role in the zkStack than the EVM does in Ethereum. The EVM is used to execute code in Ethereum's state transition function. This STF needs a client to implement and run it. Ethereum has a multi-client philosophy, there are multiple clients, and they are written in Go, Rust, and other traditional programming diff --git a/docs/guides/advanced/zk_intuition.md b/docs/guides/advanced/13_zk_intuition.md similarity index 100% rename from docs/guides/advanced/zk_intuition.md rename to docs/guides/advanced/13_zk_intuition.md diff --git a/docs/guides/advanced/deeper_overview.md b/docs/guides/advanced/14_zk_deeper_overview.md similarity index 99% rename from docs/guides/advanced/deeper_overview.md rename to docs/guides/advanced/14_zk_deeper_overview.md index 7fa4a009a920..8ec2c4d35c03 100644 --- a/docs/guides/advanced/deeper_overview.md +++ b/docs/guides/advanced/14_zk_deeper_overview.md @@ -1,6 +1,4 @@ -# Deeper Overview - -[Back to ToC](../../../README.md) +# Proof System Deeper Overview The purpose of this section is to explain our new proof system from an engineering standpoint. We will examine the code examples and how the libraries communicate. 
diff --git a/docs/guides/advanced/prover_keys.md b/docs/guides/advanced/15_prover_keys.md similarity index 100% rename from docs/guides/advanced/prover_keys.md rename to docs/guides/advanced/15_prover_keys.md diff --git a/docs/guides/advanced/advanced_debugging.md b/docs/guides/advanced/90_advanced_debugging.md similarity index 100% rename from docs/guides/advanced/advanced_debugging.md rename to docs/guides/advanced/90_advanced_debugging.md diff --git a/docs/guides/advanced/docker_and_ci.md b/docs/guides/advanced/91_docker_and_ci.md similarity index 100% rename from docs/guides/advanced/docker_and_ci.md rename to docs/guides/advanced/91_docker_and_ci.md diff --git a/docs/guides/advanced/README.md b/docs/guides/advanced/README.md new file mode 100644 index 000000000000..5a3673b558ad --- /dev/null +++ b/docs/guides/advanced/README.md @@ -0,0 +1,28 @@ +# ZKsync advanced guides + +This section contains more advanced guides that aim to explain complex internals of ZKsync ecosystem in an easy to grasp +way. + +## Table of Contents + +- [Local initialization](./01_initialization.md) +- [Deposits](./02_deposits.md) +- [Withdrawals](./03_withdrawals.md) +- [Contracts](./04_contracts.md) +- [Calls](./05_how_call_works.md) +- [Transactions](./06_how_transaction_works.md) +- [Fee model](./07_fee_model.md) +- [L2 messaging](./08_how_l2_messaging_works.md) +- [Pubdata](./09_pubdata.md) +- [Pubdata with blobs](./10_pubdata_with_blobs.md) +- [Bytecode compression](./11_compression.md) +- [EraVM intro](./12_alternative_vm_intro.md) +- [ZK intuition](./13_zk_intuition.md) +- [ZK deeper overview](./14_zk_deeper_overview.md) +- [Prover keys](./15_prover_keys.md) + +Additionally, there are a few articles that cover specific topics that may be useful for developers actively working on +`zksync-era` repo. 
+ +- [Advanced debugging](./90_advanced_debugging.md) +- [Docker and CI](./91_docker_and_ci.md) diff --git a/docs/guides/setup-dev.md b/docs/guides/setup-dev.md index aafd96cda40a..12e8da7b022f 100644 --- a/docs/guides/setup-dev.md +++ b/docs/guides/setup-dev.md @@ -2,42 +2,67 @@ ## TL;DR -If you run on 'clean' Debian on GCP: +This is a shorter version of the setup guide to make subsequent initializations easier. If it's the first time you're +initializing the workspace, it's recommended that you read the whole guide below, as it provides more context and tips. + +If you run on 'clean' Ubuntu on GCP: ```bash +# For VMs only! They don't have SSH keys, so we override SSH with HTTPS +git config --global url."https://github.com/".insteadOf git@github.com: +git config --global url."https://".insteadOf git:// + # Rust curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh # NVM curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash # All necessary stuff -sudo apt-get install build-essential pkg-config cmake clang lldb lld libssl-dev postgresql -# Docker -sudo usermod -aG docker YOUR_USER +sudo apt-get update +sudo apt-get install build-essential pkg-config cmake clang lldb lld libssl-dev postgresql apt-transport-https ca-certificates curl software-properties-common +# Install docker +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable" +sudo apt install docker-ce +sudo usermod -aG docker ${USER} + +# Stop default postgres (as we'll use the docker one) +sudo systemctl stop postgresql +sudo systemctl disable postgresql +# Start docker. +sudo systemctl start docker ## You might need to re-connect (due to usermod change). # Node & yarn nvm install 20 +# Important: there will be a note in the output to load +# new paths in your local session, either run it or reload the terminal. 
npm install -g yarn yarn set version 1.22.19 +# For running unit tests +cargo install cargo-nextest # SQL tools cargo install sqlx-cli --version 0.7.4 -# Stop default postgres (as we'll use the docker one) -sudo systemctl stop postgresql -# Start docker. -sudo systemctl start docker # Foundry curl -L https://foundry.paradigm.xyz | bash foundryup --branch master +# You will need to reload your `*rc` file here + +# Clone the repo to the desired location +git clone git@github.com:matter-labs/zksync-era.git +cd zksync-era +git submodule update --init --recursive ``` +Don't forget to [add env variables](#Environment) and look at [tips](#tips). + ## Supported operating systems ZKsync currently can be launched on any \*nix operating system (e.g. any linux distribution or MacOS). -If you're using Windows, then make sure to use WSL 2, since WSL 1 is known to cause troubles. +If you're using Windows, then make sure to use WSL 2. Additionally, if you are going to use WSL 2, make sure that your project is located in the _linux filesystem_, since accessing NTFS partitions from within WSL is very slow. @@ -90,38 +115,9 @@ If logging out does not resolve the issue, restarting the computer should. ## `Node` & `Yarn` -1. Install `Node` (requires version `v18.18.0`). Since our team attempts to always use the latest LTS version of - `Node.js`, we suggest you to install [nvm](https://github.com/nvm-sh/nvm). It will allow you to update `Node.js` - version easily in the future (by running `nvm use v18.18.0` in the root of the repository) -2. Install `yarn` (make sure to get version 1.22.19 - you can change the version by running `yarn set version 1.22.19`). - Instructions can be found on the [official site](https://classic.yarnpkg.com/en/docs/install/). Check if `yarn` is - installed by running `yarn -v`. 
If you face any problems when installing `yarn`, it might be the case that your - package manager installed the wrong package.Make sure to thoroughly follow the instructions above on the official - website. It contains a lot of troubleshooting guides in it. - -## `Axel` - -Install `axel` for downloading keys: - -On mac: - -```bash -brew install axel -``` - -On debian-based linux: - -```bash -sudo apt-get install axel -``` - -Check the version of `axel` with the following command: - -``` -axel --version -``` - -Make sure the version is higher than `2.17.10`. +1. Install `Node` (requires version `v20`). The recommended way is via [nvm](https://github.com/nvm-sh/nvm). +2. Install `yarn`. Can be done via `npm install -g yarn`. Make sure to get version 1.22.19 - you can change the version + by running `yarn set version 1.22.19`. ## `clang` @@ -221,33 +217,9 @@ SQLx is a Rust library we use to interact with Postgres, and its CLI is used to features of the library. ```bash -cargo install --locked sqlx-cli --version 0.7.3 +cargo install --locked sqlx-cli --version 0.7.4 ``` -## Solidity compiler `solc` - -Install the latest solidity compiler. - -On mac: - -```bash -brew install solidity -``` - -On debian-based linux: - -```bash -sudo add-apt-repository ppa:ethereum/ethereum -sudo apt-get update -sudo apt-get install solc -``` - -Alternatively, download a [precompiled version](https://github.com/ethereum/solc-bin) and add it to your PATH. - -## Python - -Most environments will have this preinstalled but if not, install Python. - ## Easier method using `nix` Nix is a tool that can fetch _exactly_ the right dependencies specified via hashes. The current config is Linux-only but @@ -274,11 +246,10 @@ Edit the lines below and add them to your shell profile file (e.g. 
`~/.bash_prof export ZKSYNC_HOME=/path/to/zksync export PATH=$ZKSYNC_HOME/bin:$PATH - -# If you're like me, uncomment: -# cd $ZKSYNC_HOME ``` +## Tips + ### Tip: `mold` Optionally, you may want to optimize the build time with the modern linker, [`mold`](https://github.com/rui314/mold). diff --git a/etc/env/base/fri_prover.toml b/etc/env/base/fri_prover.toml index 1578a7f66e38..1c93752251bc 100644 --- a/etc/env/base/fri_prover.toml +++ b/etc/env/base/fri_prover.toml @@ -1,5 +1,5 @@ [fri_prover] -setup_data_path = "vk_setup_data_generator_server_fri/data" +setup_data_path = "crates/bin/vk_setup_data_generator_server_fri/data" prometheus_port = 3315 max_attempts = 10 generation_timeout_in_secs = 600 diff --git a/etc/env/configs/prover-local.toml b/etc/env/configs/prover-local.toml new file mode 100644 index 000000000000..1850871bc2c8 --- /dev/null +++ b/etc/env/configs/prover-local.toml @@ -0,0 +1,8 @@ +# Config for running prover locally +__imports__ = [ "base", "l1-inits/.init.env", "l2-inits/dev.init.env" ] + +[eth_sender.sender] +proof_sending_mode = "OnlyRealProofs" + +[fri_prover] +cloud_type = "Local" diff --git a/prover/README.md b/prover/README.md index 5e537bf8bc0b..98ee4edb78e9 100644 --- a/prover/README.md +++ b/prover/README.md @@ -2,36 +2,11 @@ This directory contains all the libraries and binaries related to proving of the blocks. -Directories with 'fri' suffix, are mostly used with the new proof system (Boojum). - -## Components - -### vk_setup_data_generator_server_fri - -Set of tools to create setup keys, verification keys and verification key hashes for the circuits. - -Usually run once, and then we use their outputs in multiple places in the system. - -### prover_fri_gateway - -Communication module between the 'main' server running the state keeper, and the proving subsystem. - -### witness_generator - -Creating prover jobs and saving necessary artifacts. - -### prover_fri - -This directory contains the main 'prover'. 
It can be run in two modes: either as CPU or as GPU. (controlled via 'gpu' -feature flag). - -### witness_vector_generator - -Only used in GPU proving mode. Prepares all the witness data using CPU, and then streams it to the prover_fri. - -This is mostly used for resource efficiency (as machines with GPUs are more expensive, it allows us to run many -witness_vector_generators, that can 'share' as single gpu based prover_fri). - -### proof_fri_compressor - -Used as a 'last step' to compress/wrap the final FRI proof into a SNARK (to make L1 verification cheaper). +## Documentation + +- [Intro](docs/00_intro.md) +- [Setting up a GCP VM](docs/01_gcp_vm.md) +- [Workspace setup](docs/02_setup.md) +- [Running prover subsystem](docs/03_launch.md) +- [Proof generation flow](docs/04_flow.md) +- [Further reading](docs/99_further_reading.md) diff --git a/prover/crates/bin/prover_fri/README.md b/prover/crates/bin/prover_fri/README.md index 141b058172f7..6a802cbcd8e1 100644 --- a/prover/crates/bin/prover_fri/README.md +++ b/prover/crates/bin/prover_fri/README.md @@ -1,352 +1,5 @@ # FRI Prover -## Overview of the pipeline +Implementation of the circuit prover. -These are the main components to this process: - -- Sequencer -- Prover gateway -- Witness -- Prover -- Compressor - -All of them will be sharing information through a SQL database and GCS bucket. The general idea is that the sequencer -will produce blocks and the gateway will place them into the database to be proven. Then, the rest of the components -will pull jobs from the database and do their part of the pipeline, loading intermediary artifacts from GCS. 
- -```mermaid -flowchart LR - A["Operator"] -->|Produces block| F[Prover Gateway] - F -->|Inserts into DB| B["Postgres DB"] - B -->|Retrieves proven block \nafter compression| F - B --> C["Witness"] - C --- C1["Basic Circuits"] - C --- C2["Leaf Aggregation"] - C --- C3["Node Aggregation"] - C --- C4["Recursion Tip"] - C --- C5["Scheduler"] - C --> B - B --> D["Vector Generator/Prover"] - D -->|Proven Block| B - B --> G["Compressor"] - G -->|Compressed block| B -``` - -## Prerequisites - -Make sure these dependencies are installed and available on your machine: -[Installing dependencies](../../docs/guides/setup-dev.md). Make sure you go through all steps, including setting -environment variables for `zk`. Same work is done at the bottom of this doc, if you want a TL;DR; for running GPU -provers on GCP. - -## Proving a block using GPU prover locally - -Below steps can be used to prove a block on local machine using GPU prover. Running a GPU prover requires a CUDA 12.0 -installation as a pre-requisite, alongside these machine specs: - -- CPU: At least 16 physical cores -- RAM: 85GB of RAM -- Disk: 200GB of free disk (lower might be fine, depending on how many proofs you want to generate) -- GPU: NVIDIA GPU with CUDA support and at least 6GB of VRAM, we recommend to use GPUs with at least 16GB VRAM for - optimal performance. In our GPU picks for datacenters while running on Google Cloud Platform, the L4 takes the top - spot in terms of price-to-performance ratio, with the T4 coming in second. - -1. Initialize DB and run migrations (make sure you're in the root of the repo): `zk && zk init` -2. Run the server. In the root of the repository: - - ```console - zk server --components=api,eth,tree,state_keeper,housekeeper,commitment_generator,proof_data_handler,vm_runner_bwip - ``` - - Note that it will produce a first l1 batch that can be proven (should be batch 0). - -3. Generate the GPU setup data (no need to regenerate if it's already there). 
If you want to use this with the GPU - compressors, you need to change the key in the file from `setup_2^26.key` to `setup_2^24.key`. This will consume - around 20GB of disk. You need to be in the `prover/` directory (for all commands from here onwards, you need to be in - the `prover/` directory) and run: - - ```console - ./setup.sh gpu - ``` - -4. Run prover gateway to fetch blocks to be proven from server: - - ```console - zk f cargo run --release --bin zksync_prover_fri_gateway - ``` - -5. Run 4 witness generators to generate witness for each round: - - ```console - API_PROMETHEUS_LISTENER_PORT=3116 zk f cargo run --release --bin zksync_witness_generator -- --round=basic_circuits - API_PROMETHEUS_LISTENER_PORT=3117 zk f cargo run --release --bin zksync_witness_generator -- --round=leaf_aggregation - API_PROMETHEUS_LISTENER_PORT=3118 zk f cargo run --release --bin zksync_witness_generator -- --round=node_aggregation - API_PROMETHEUS_LISTENER_PORT=3119 zk f cargo run --release --bin zksync_witness_generator -- --round=recursion_tip - API_PROMETHEUS_LISTENER_PORT=3120 zk f cargo run --release --bin zksync_witness_generator -- --round=scheduler - ``` - - or alternatively (recommended), start all of them with - - ```console - API_PROMETHEUS_LISTENER_PORT=3116 zk f cargo run --release --bin zksync_witness_generator -- --all_rounds - ``` - - Note that this will automatically open four ports: 3116 (the starting port), 3117, 3118 and 3119 for subsequent - provers. - -6. Run witness vector generators to feed jobs to GPU prover: - - ```console - FRI_WITNESS_VECTOR_GENERATOR_PROMETHEUS_LISTENER_PORT=3420 zk f cargo run --release --bin zksync_witness_vector_generator - ``` - - Note that you may run multiple of them (as 1 prover roughly can be fed by 10 vector generators). Make sure to use a - different port! - -7. Run prover to perform actual proving: `zk f cargo run --features "gpu" --release --bin zksync_prover_fri` - -8. 
Finally, run proof compressor to compress the proof to be sent on L1: - `zk f cargo run --release --bin zksync_proof_fri_compressor` - -## Block proving with CPU - -We don't recommend using this method, as at the moment none are ran in production and may be broken. There will be -investment in the future, but for the time being, please use GPU provers. That said, instructions are left below for -brave adventurers. - -Below steps can be used to prove a block on local machine using CPU prover. This is useful for debugging and testing -Machine specs: - -- CPU: At least 8 physical cores -- RAM: 80GB of RAM (enable swap if your machine has less RAM) -- Disk: 400GB of free disk - -1. Install Rust (correct version from rust-toolchain file should be used automatically if you don't have any local - overrides) -2. Initialize DB and run migrations. Go into the root of the repository, then run - - ```console - zk init - ``` - -3. Generate the CPU setup data (no need to regenerate if it's already there). This will consume around 300GB of disk. - For this, move to the `prover` directory, and run - - ```console - ./setup.sh - ``` - - For the following steps, we recommend using `tmux` to run every command on a separate session, so you can attach to - and monitor logs for each one. - -4. Run the sequencer/operator. In the root of the repository: - - ```console - zk server --components=api,eth,tree,state_keeper,housekeeper,commitment_generator,proof_data_handler - ``` - - to produce blocks to be proven - -5. Move again into the `prover` directory. The rest of the steps will be performed from there. Run prover gateway to - fetch blocks to be proven from server: - - ```console - zk f cargo run --release --bin zksync_prover_fri_gateway - ``` - -6. Run 4 witness generators to generate witness for each round: - -7. Run prover to perform actual proving: - - ```console - zk f cargo run --release --bin zksync_prover_fri - ``` - -8. 
Finally, run proof compressor to compress the proof to be sent on L1: - - ```console - zk f cargo run --release --bin zksync_proof_fri_compressor - ``` - -## Running GPU compressors - -There is an option to run compressors with the GPU, which will significantly improve the performance. - -1. The hardware setup should be the same as for GPU prover -2. Install and compile `era-bellman-cuda` library - - ```console - git clone https://github.com/matter-labs/era-bellman-cuda - cmake -Bera-bellman-cuda/build -Sera-bellman-cuda/ -DCMAKE_BUILD_TYPE=Release - cmake --build bellman-cuda/build/ - ``` - -3. Set path of library as environmental variable - - ```console - export BELLMAN_CUDA_DIR=$PWD/bellman-cuda - ``` - -4. GPU compressor uses `setup_2^24.key`. Download it by using: - - ```console - wget https://storage.googleapis.com/matterlabs-setup-keys-us/setup-keys/setup_2^24.key - ``` - -5. Set the env variable with it's path: - - ```console - export CRS_FILE=$PWD/setup_2^24.key - ``` - -6. Run the compressor using: - - ```console - zk f cargo run --features "gpu" --release --bin zksync_proof_fri_compressor - ``` - -## Checking the status of the prover - -Once everything is running (either with the CPU or GPU prover), the server should have at least three blocks, and you -can see the first one by running - -```console -curl -X POST -H 'content-type: application/json' localhost:3050 -d '{"jsonrpc": "2.0", "id": 1, "method": "zks_getBlockDetails", "params": [0]}' -``` - -and then similarly for blocks number `1` and `2` by changing the parameters. - -The prover gateway will then fetch block number 1 to prove and start the entire proving pipeline, which starts out by -generating the witness, then passing it to the prover, then to the compressor to wrap it inside a SNARK to send to L1. 
- -You can follow the status of this pipeline by running - -```console -zk status prover -``` - -This might take a while (around an hour and a half on my machine using the CPU prover), you can check on it once in a -while. A successful flow should output something like - -```console -==== FRI Prover status ==== -State keeper: First batch: 0, recent batch: 1 -L1 state: block verified: 1, block committed: 1 -Verification key hash on contract is 0x4be443afd605a782b6e56d199df2460a025c81b3dea144e135bece83612563f2 -Verification key in database is 0x4be443afd605a782b6e56d199df2460a025c81b3dea144e135bece83612563f2 -Verifier hash matches. -Verifier params on contract are 0x5a3ef282b21e12fe1f4438e5bb158fc5060b160559c5158c6389d62d9fe3d080, 0x72167c43a46cf38875b267d67716edc4563861364a3c03ab7aee73498421e828, 0x0000000000000000000000000000000000000000000000000000000000000000 -Verification params match. -Next block that should be verified is: 2 -Checking status of the proofs... -Proof progress for 1 : 111 successful, 0 failed, 0 in progress, 0 queued. Compression job status: successful -``` - -The most important thing here is the following line - -```console -L1 state: block verified: 1, block committed: 1 -``` - -which means the proof for the block was verified on L1. - -## Performing circuit upgrade - -Performing circuit upgrade requires crypto library to be updated and generating new setup data, verification key, -finalization hints if the circuit changes. Below steps can be used to perform circuit upgrade: - -1. checkout if the circuit geometry has changed in the new version of the circuit by running the - [workflow](https://github.com/matter-labs/era-zkevm_test_harness/blob/v1.4.0/.github/workflows/.github/workflows/geometry-config-generator.yml) - in harness and merge the generated PR. -2. update the relevant crypto dependencies(boojum, zkevm_circuit, harness, etc) in `Cargo.lock`, for example: - `cargo update -p zkevm_test_harness@1.4.0` -3. 
prepare an PR with the updated dependencies [sample PR](https://github.com/matter-labs/zksync-2-dev/pull/2481). -4. Run the verification key - [workflow](https://github.com/matter-labs/zksync-era/actions/workflows/fri-vk-generator.yaml) against the PR to - generate the verification key and finalization hints for the new circuit. -5. Only once the above verification key workflow is successful, start the setup-data generation(CPU, GPU setup data - generation can be done in parallel), this step is important, since the setup data requires the new VK, we need to - wait for it to finish. -6. Run the CPU setup data generation - [workflow](https://github.com/matter-labs/zksync-era/actions/workflows/fri-setup-data-generator.yml) against the PR - to generate the CPU setup data. -7. Run the GPU setup data generation - [workflow](https://github.com/matter-labs/zksync-era/actions/workflows/fri-gpu-setup-data-generator.yml) against the - PR to generate the GPU setup data. -8. Once the setup data generation workflows are successful, update the PR with `setup_keys_id` id in - [build-docker-from-tag.yml](../../.github/workflows/build-docker-from-tag.yml) and in - [build-prover-fri-gpu-gar.yml](https://github.com/matter-labs/zksync-era/blob/main/.github/workflows/build-prover-fri-gpu-gar.yml), - make sure to only do it from `FRI prover` not old. - -## Quick Machine Setup for GPU proving on GCP - -``` -# As of 11th of March, 2024 - -# Go to GCP -> pick a project -> compute engine -> create instance -# Give the machine a name -# Go to GPUs and select Nvidia L4, g2-standard-32 (32 vCPUs, 16 core, 128 GB memory) -# Boot disk, select Ubuntu, Ubuntu 22.04 (x86), select SSD persistent disk and change size to 200GB - -# You should have the machine available, that you can SSH into. 
Assuming you're SSHed in from this point forward - -# Install Rust -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh - -# Install cMake from https://apt.kitware.com/ -- not listing steps as they may change -... - -# Install cuda -- again not listing steps as they may change -- https://developer.nvidia.com/cuda-downloads -- make sure to select Linux, x86_64, Ubuntu, 22.04, deb(network) and follow through -... - -# Make sure to make the nvidia software available -echo 'export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}' >> ~/.bashrc - -# Reboot for the drivers to kick-in -sudo reboot - -# From here, you can follow-up the instructions from the main setup doc `core/docs/guides/setup-dev.md`; a TL;DR; is: - -# Install NVM -curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash - -# Install dependencies -sudo apt-get install -y build-essential pkg-config cmake clang lldb lld libssl-dev postgresql docker docker-compose-v2 axel - -# Make docker work -sudo usermod -aG docker YOUR_USER - -# Make sure you have all binaries loaded in your environment -source ~/.bashrc - -# Setup the node part -nvm install 18 -npm install -g yarn -yarn set version 1.22.19 - -# Install SQLX for database management -cargo install sqlx-cli --version 0.7.3 - -# Get solidity working -sudo add-apt-repository ppa:ethereum/ethereum -sudo apt-get update -sudo apt-get install solc - -# Make zk work -- insert below into ~/.bashrc -export ZKSYNC_HOME=/path/to/zksync - -export PATH=$ZKSYNC_HOME/bin:$PATH - -# Let's get the last bits of the environment in the desired state (stop postgres, as we use it in docker and start docker) -sudo systemctl stop postgresql -sudo systemctl disable postgresql -sudo systemctl start docker - -sudo reboot - -# Of course, let's get the code -git clone https://github.com/matter-labs/zksync-era.git - -# Load everything in the env and you're good to go -source ~/.bashrc && cd ~/zksync-era -``` +GPU circuit proving is the only maintained 
implementation. CPU circuit proving has been deprecated. diff --git a/prover/docs/00_intro.md b/prover/docs/00_intro.md new file mode 100644 index 000000000000..fb79cf5bed0e --- /dev/null +++ b/prover/docs/00_intro.md @@ -0,0 +1,80 @@ +# Prover subsystem introduction + +The prover subsystem consists of several binaries that perform different steps of the batch proof generation process, as +follows: + +- [Prover gateway][pg]: interface between core and prover subsystems, fetches batch jobs from core, and sends batch + proofs back to core. +- [Witness generator][wg]: component that takes batch information (tx execution/state diffs/computation results) and + constructs witness for proof generation. +- [Witness vector generator][wvg]: component that uses witness generator output and computes witness vector (_roughly_: + data to be fed into GPU) for circuit provers. +- [Circuit prover][p]: component that generates a circuit proof (GPU accelerated). +- [Proof compressor][pc]: component that "wraps" the generated proof so that it can be sent to L1 (GPU accelerated). + +While not technically a part of the prover workspace, the following components are essential for it: + +- [Proof data handler][pdh]: API on the core side which Prover gateway interacts with. +- [House keeper][hk]: Metrics exporter and job rescheduler. In its absence, jobs would not be rescheduled and metrics + used for autoscaling would not exist, rendering internal autoscaling infrastructure useless. + +Finally, the prover workspace has several CLI tools: + +- [Circuit key generator][vkg]: CLI used to generate keys required for proving. +- [Prover CLI][pcli]: CLI for observing and maintaining the production proving infrastructure. + +There are core components that also participate in the proof generation process by preparing the input data, such as +[metadata calculator][mc], [commitment generator][cg], [basic witness input producer][bwip], and [protective reads +writer][prw]. 
We won't cover them much in these docs, but it's better to know that they exist and are important for the +prover subsystem as well. + +We'll cover how the components work further in documentation. + +[pg]: ../crates/bin/prover_fri_gateway/ +[wg]: ../crates/bin/witness_generator/ +[wvg]: ../crates/bin/witness_vector_generator/ +[p]: ../crates/bin/prover_fri/ +[pc]: ../crates/bin/proof_fri_compressor/ +[pdh]: ../../core/node/proof_data_handler/ +[hk]: ../../core/node/house_keeper/ +[vkg]: ../crates/bin/vk_setup_data_generator_server_fri/ +[pcli]: ../crates/bin/prover_cli/ +[mc]: ../../core/node/metadata_calculator/ +[cg]: ../../core/node/commitment_generator/ +[bwip]: ../../core/node/vm_runner/src/impls/bwip.rs +[prw]: ../../core/node/vm_runner/src/impls/protective_reads.rs + +## How it runs + +Proof generation is a multi-stage process, where the initial jobs are created by the Prover gateway, and then moved by +the House Keeper until the proof is generated. + +The real-life deployment of prover subsystem looks as follows: + +- 1x prover gateway +- 1x house keeper +- Many witness generators +- Many witness vector generators +- Many circuit provers +- 1+ proof compressors + +Currently, the proving subsystem is designed to run in GCP. In theory, it's mostly environment-agnostic, and all of the +components can be launched locally, but more work is needed to run a production system in a distributed mode outside of +GCP. + +Witness generators, witness vector generators, and provers are spawned on demand based on the current system load via an +autoscaler (WIP, so not released publicly yet). They can be spawned in multiple clusters among different zones, based on +the availability of machines with required specs. + +## How to develop + +Different parts of the subsystem have different hardware requirements, but the aggregated summary to be able to run +everything on a single machine is as follows: + +- CPU with 16+ physical cores.
+- GPU with CUDA support and at least 24 GB of VRAM. +- At least 64GB of RAM. +- 200+ GB of disk space. 400+ GB is recommended for development, as `/target` directory can get quite large. + +Given that the requirements are quite high, it's often more convenient developing the prover in a GCP VM rather than on +a local machine. Setting up a VM is covered further in docs. diff --git a/prover/docs/01_gcp_vm.md b/prover/docs/01_gcp_vm.md new file mode 100644 index 000000000000..a541495e978a --- /dev/null +++ b/prover/docs/01_gcp_vm.md @@ -0,0 +1,147 @@ +# Creating a GCP VM + +In this section we will cover the creation of a VM suitable for prover development. We assume that you already have +access to the GCP cluster. + +## When you need a VM + +Generally, you don't always need a VM to work on prover. You typically need it to either modify the code under +`cfg(feature = "gpu")` flag, or when you need to run some tests. Moreover, VMs are shared, e.g. many people have access +to them, and you can't store sensitive data (like SSH keys) there, so they can't be used as primary workstations. +Finally, the VMs with GPU aren't cheap, so we expect you to use them when you really need them. + +A typical workflow so far is to instantiate a new VM when you need it, and remove once you're done. Remember: even if +the VM is stopped, the SSD is persisted, so it's not free. + +## Create a VM + +Open [Google cloud console](https://console.cloud.google.com/) and choose "Compute Engine". + +On the "Compute Engine" page choose the cluster suitable for creating VMs with GPU, and then click on "Create instance". + +We will need a GPU **L4** instance, so find the zone that is close to you geographically and has such instances. At the +time of writing, `europe-west2` is one of the possible options. L4 is recommended as the cheapest option, but you may +use a beefier machine if you need it.
+ +When you choose the region, set the following options: + +- Name: A descriptive name that contains your name, e.g. `john-doe-prover-dev-machine`. +- Region and zone: Values you've found above. +- Machine configuration: "GPUs", then: + - GPU Type: NVIDIA L4 + - Number of GPUs: 1 + - Machine type: Preset, `g2-standard-16` +- Availability policies: Spot instances are much cheaper, but there is a chance that it will be preempted while you work + on it. If you're working on something that is not very important, spot instances are recommended. If any disruption + will be harmful, choose standard provisioning. +- Then click on "VM provisioning model advanced settings" and + - Click on "Set a time limit for the VM" + - Set the limit to 8 hours +- On VM termination: Stop +- Boot disk: Click on "Change", then: + - Operating system: Ubuntu + - Version: Ubuntu 22.04 LTS (x86/64) + - Boot disk type: SSD persistent disk + - Size: 300GB + +Leave the remaining options as is and click on "Create". + +You will have to wait a bit and then your instance will be created. Once you see that the machine is running, click on +an arrow near "SSH" in the list of options, and choose "Open in browser window". + +You should successfully connect to your machine now. + +⚠️ Don't forget to remove the VM once you've finished your scope of work. It's OK to keep the machine if you expect to +work with it on the next working day, but otherwise it's better to remove and create a new one when needed. + +## Adding your own ssh key (on local machine) + +Using browser to connect to the machine may not be the most convenient option. Instead, we can add an SSH key to be able +to connect there. + +It is highly recommended to generate a new SSH key specifically for this VM, for example: + +``` +ssh-keygen -t rsa -f ~/.ssh/gcp_vm -C <your work email> -b 2048 +``` + +...where "your work email" is the same email you use to access GCP.
+ +Check the contents of the public key: + +``` +cat ~/.ssh/gcp_vm.pub +``` + +Click on your machine name, then click on "Edit". Scroll down until you see "SSH Keys" section and add the generated +public key there. Then save. + +Get back to the list of VMs and find the external IP of your VM. Now you should be able to connect to the VM via ssh. +Assuming that your work email is `abc@example.com` and the external IP is 35.35.35.35: + +``` +ssh -i ~/.ssh/gcp_vm abc@35.35.35.35 +``` + +## Make the VM cozy + +If you intend to use the VM somewhat regularly, install all the tools you would normally install on your own machine, +like `zsh` and `nvim`. + +It is also _highly recommended_ to install `tmux`, as you will have to run multiple binaries and observe their output. +If you don't know what it is or why you should care, watch [this video](https://www.youtube.com/watch?v=DzNmUNvnB04). + +Native `tmux` may be hard to use, so you may also want to install some configuration for it, e.g. + +- [oh-my-tmux](https://github.com/gpakosz/.tmux) or +- [tmux-sensible](https://github.com/tmux-plugins/tmux-sensible). + +Finally, it is recommended to choose a different terminal theme or prompt than what you use locally, so that you can +easily see whether you're running in the VM or locally. + +## Connecting via VS Code + +VS Code can connect to VMs via SSH, so you can have the comfort of using your own IDE while still running everything on +a remote machine. + +If you're using WSL, note that VS Code will have to look up the keys in Windows, so you will have to copy your keys +there as well, e.g.: + +``` +cp ~/.ssh/gcp_vm* /mnt/c/Users/User/.ssh +``` + +Then, when you open a fresh VS Code window, in the "Start" section: + +- Choose "Connect to Host" +- Click on "Configure Hosts" +- Create a host entry. + +Host entry looks as follows: + +``` +Host <host_alias> + HostName <external IP> + IdentityFile <path to private SSH key> + User <your user name in VM> +``` + +E.g.
for the command we've used as an example before: `ssh -i ~/.ssh/gcp_vm abc@35.35.35.35`, the file will be: + +``` +Host gcp_vm + HostName 35.35.35.35 + IdentityFile ~/.ssh/gcp_vm + User abc +``` + +Once you've configured the host, you can click on "Connect to" again, then "Connect to Host", and your VM should be +listed there. On the first connect you'll have to confirm that you want to connect to it, and then choose the operating +system (Linux). + +## On security + +Do not store SSH keys, tokens, or other private information on GCP VMs. Do not use SSH keys forwarding either. These VMs +are shared, and every person has root access to all the VMs by default. + +You may, however, use tools like `rsync` or `sshfs`. diff --git a/prover/docs/02_setup.md b/prover/docs/02_setup.md new file mode 100644 index 000000000000..67c2b0b945ff --- /dev/null +++ b/prover/docs/02_setup.md @@ -0,0 +1,58 @@ +# Development environment setup + +In this section, we cover installing prerequisites for running prover subsystem. We assume that you have a prepared +machine in place, e.g. a compatible local machine or a prepared GCP VM. + +## ZKsync repo setup + +If you haven't already, you need to initialize the ZKsync repository first. Follow +[this guide](../../docs/guides/setup-dev.md) for that. + +Before proceeding, make sure that you can run the server and integration tests pass. + +## Prover-specific prerequisites + +### Cmake 3.24 or higher + +Use [Kitware APT repository](https://apt.kitware.com/). + +### CUDA runtime + +If you're using a local machine, make sure that you have up-to-date GPU driver. + +Use [Official CUDA downloads](https://developer.nvidia.com/cuda-downloads). + +Choose: OS -> Linux -> x86_64 -> Ubuntu (For WSL2 choose WSL-Ubuntu) -> 22.04 -> deb (network). + +Install both the base and driver (kernel module flavor). 
+ +Set up environment variables: add the following to your configuration file (`.bashrc`/`.zshrc`): + +``` +# CUDA +export CUDA_HOME=/usr/local/cuda +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 +export PATH=$PATH:$CUDA_HOME/bin +``` + +Reboot for the drivers to kick-in. + +### Bellman-CUDA + +Bellman-CUDA is a library required for GPU proof compressor. + +Navigate to some directory where you want to store the code, and then do the following: + +``` +git clone git@github.com:matter-labs/era-bellman-cuda.git +cmake -Bera-bellman-cuda/build -Sera-bellman-cuda/ -DCMAKE_BUILD_TYPE=Release +cmake --build era-bellman-cuda/build/ +``` + +After that add the following environment variable to your config (`.bashrc`/`.zshrc`): + +``` +export BELLMAN_CUDA_DIR=<PATH_TO>/era-bellman-cuda +``` + +Don't forget to reload it (e.g. `source ~/.zshrc`). diff --git a/prover/docs/03_launch.md b/prover/docs/03_launch.md new file mode 100644 index 000000000000..2c5809e994e5 --- /dev/null +++ b/prover/docs/03_launch.md @@ -0,0 +1,101 @@ +# Running provers + +## Preparing + +First, run the following command: + +``` +zk env prover-local +``` + +It will create a config similar to `dev`, but with: + +- Proof sending mode set to `OnlyRealProofs` +- Prover mode set to `Local` instead of `GCS`. + +You can always switch back to dev config via `zk env dev`. + +## Enter the prover workspace + +All the commands for binaries in the prover workspace must be done from the prover folder: + +``` +cd $ZKSYNC_HOME/prover +``` + +## Key generation + +This operation should only be done once; if you already generated keys, you can skip it. + +The following command will generate the required keys: + +``` +zk f cargo run --features gpu --release --bin key_generator -- generate-sk-gpu all --recompute-if-missing +``` + +With that, you should be ready to run the prover. + +## Running + +Important!
Generating a proof takes a lot of time, so if you just want to see whether you can generate a proof, do it +against clean sequencer state (e.g. right after `zk init`). + +We will be running a bunch of binaries, it's recommended to run each in a separate terminal. + +### Server + +``` +zk server --components=api,tree,eth,state_keeper,housekeeper,tee_verifier_input_producer,commitment_generator,da_dispatcher,proof_data_handler,vm_runner_protective_reads,vm_runner_bwip +``` + +### Prover gateway + +``` +zk f cargo run --release --bin zksync_prover_fri_gateway +``` + +Then wait until the first job is picked up. Prover gateway has to insert protocol information into the database, and +until it happens, witness generators will panic and won't be able to start. + +### Witness generator + +Once a job is created, start witness generators: + +``` +API_PROMETHEUS_LISTENER_PORT=3116 zk f cargo run --release --bin zksync_witness_generator -- --all_rounds +``` + +`--all_rounds` means that witness generator will produce witnesses of all kinds. You can run a witness generator for +each round separately, but it's mostly useful in production environments. + +### Witness vector generator + +``` +FRI_WITNESS_VECTOR_GENERATOR_PROMETHEUS_LISTENER_PORT=3420 zk f cargo run --release --bin zksync_witness_vector_generator +``` + +WVG prepares inputs for prover, and it's a single-threaded time-consuming operation. You may run several instances (make +sure to use different ports). The exact amount of WVGs needed to "feed" one prover depends on CPU/GPU specs, but a +ballpark estimate (useful for local development) is 10 WVGs per prover. + +### Prover + +``` +zk f cargo run --features "gpu" --release --bin zksync_prover_fri +``` + +Prover can prove any kind of circuit, so you only need a single instance. + +### Proof compressor + +⚠️ Both prover and proof compressor require 24GB of VRAM, and currently it's not possible to make them use different +GPU.
So unless you have a GPU with 48GB of VRAM, you won't be able to run both at the same time. + +You should wait until the proof is generated, and once you see in the server logs that it tries to find available +compressor, you can shut the prover down, and run the proof compressor: + +``` +zk f cargo run --features "gpu" --release --bin zksync_proof_fri_compressor +``` + +Once the proof is compressed, prover gateway will see that and will send the generated proof back to core. diff --git a/prover/docs/04_flow.md b/prover/docs/04_flow.md new file mode 100644 index 000000000000..9bb5ebacbc40 --- /dev/null +++ b/prover/docs/04_flow.md @@ -0,0 +1,238 @@ +# Prover flow + +In this section, we're going to learn what stages the proof generation process has. It's a complex process, so +we'll be looking at it from five perspectives: + +- Core<->Prover subsystem interactions. +- Core side of workflow. +- Prover pipeline. +- Batch proof generation. +- Infrastructure distribution. + +After that, we will touch on how this flow is mapped on the actual production infrastructure. + +## Core <-> Prover subsystem interactions + +Core and prover subsystem are built in such a way that they are mostly isolated from each other. Each side has its own +database and GCS buckets, and both have "gateway" components they use for interaction. + +The only exception here is the `house_keeper`: it's a component that exists as a part of the server, its main purpose +is to manage jobs (and emit metrics for job management) in the prover workspace, but at the same time it has access to +both core and prover databases. The component will probably be split in the future and most of it will be moved to the +prover workspace.
+ +Otherwise, the interaction between subsystems can be expressed as follows: + +```mermaid +sequenceDiagram + participant C as Core + participant P as Prover + + loop In parallel, for each batch + P-->>+C: Get a job to prove + C->>-P: Unproven batch + P->>P: Calculate proof + P->>C: Submit proof + end +``` + +Core exposes an API, and Prover repeatedly polls this API, fetching new batch proof jobs and submitting batch proofs. + +## Core side of workflow + +Despite the fact that the prover is isolated from the core, the core has multiple components specifically designed to +prepare _inputs_ for proving. + +The following diagram shows what happens under the hood when the prover subsystem requests a new job: + +```mermaid +sequenceDiagram + box Core + participant Ob as GCS + participant DB as Core database + participant API as Proof data handler + end + participant P as Prover + P-->>+API: Get a job + API-->>DB: Lock a suitable job + DB->>API: Job is marked as "picked_up" + API-->>Ob: Fetch BWIP data + Ob->>API: Return BWIP data + API-->>Ob: Fetch Merkle Tree data + Ob->>API: Return Merkle Tree data + API-->>DB: Fetch batch metadata + DB->>API: Return batch metadata + API->>-P: Return a job +``` + +First of all, `proof_data_handler` will check if all the data required for the proof generation is already prepared by +the core. If so, it will lock the job so that it's not assigned twice, and will fetch required information from multiple +sources. Then this data is given to the prover together with the batch number. + +## Prover pipeline + +Once job is received by the prover, it has to go through several different stages. Consider this a mental model of the +pipeline, since in reality some stages happen in parallel, and some have different degree of sequencing. 
+ +```mermaid +sequenceDiagram +participant C as Core +box Prover +participant PG as Gateway +participant BPG as Basic WG+Proving +participant LPG as Leaf WG+Proving +participant NPG as Node WG+Proving +participant RTPG as Recursion tip WG+Proving +participant SPG as Scheduler WG+Proving +participant CP as Compressor +end +C-->>PG: Job +PG->>BPG: Batch data +BPG->>LPG: Basic proofs +LPG->>NPG: Aggregated proofs (round 1) +NPG->>NPG: Internal aggregation to get 1 proof per circuit type +NPG->>RTPG: Aggregated proofs (round 2) +RTPG->>SPG: Aggregated proofs (round 3) +SPG->>CP: Aggregated proof (round 4) +CP->>PG: SNARK proof +PG-->>C: Proof +``` + +When we process the initial job (during basic witness generation) we create many sub-jobs for basic proof generation. +Once they are processed, we start to aggregate generated proofs, and we do it in "levels". With each aggregation level, +we reduce the number of jobs. + +Aggregation levels are commonly referred to by numbers in the prover workspace, from 0 to 4. So if someone mentions +"aggregation round 2", they refer to the "node" stage, and round 4 corresponds to the "scheduler" stage. Proof +compression is considered a separate operation, and doesn't have a numeric value. + +Jobs within the aggregation round may also have different types, but this will be covered later. + +The actual numbers may vary, but just for example there might exist a batch, so that it initially creates 10000 jobs, +which are processed as follows: + +- On round 0, we also emit 10000 jobs. We aren't doing "actual" aggregation here. +- On round 1, we're turning 10000 jobs into 100. +- On round 2, we should turn these 100 jobs into at most 16. Depending on the batch parameters, it may require + additional "iterations" of the stage. For example, after we processed the initial 100 jobs, we may get 35 proofs. + Then, additional node level jobs will be created, until we reduce the number to at most 16. +- On round 3, we're turning 16 jobs into 1.
+- On round 4, we already have just 1 job, and we produce a single aggregated proof. +- Finally, the proof is processed by the proof compressor and sent back to the core. + +Once again, these numbers are just for example, and don't necessarily represent the actual state of affairs. The exact +number of jobs depends on the number of txs in a batch (and what's done inside those txs) while the aggregation split +(mapping of `N circuits of level X` to `M circuits of level X + 1`) is determined by the config geometry. + +## Actual proof generation + +Every "job" we mentioned has several sub-stages. More precisely, it receives some kind of input, which is followed by +witness generation, witness vector generation, and circuit proving. The output of circuit proving is passed as an input +for the next "job" in the pipeline. + +For each aggregation level mentioned above the steps are the same, though the inputs and outputs are different. + +```mermaid +sequenceDiagram +participant Ob as Prover GCS +participant DB as Prover DB +participant WG as Witness Generator +participant WVG as Witness Vector Generator +participant P as Prover +WG-->>DB: Get WG job +DB->>WG: Job +WG-->>Ob: Get job data +Ob->>WG: Data for witness generation +WG->>WG: Build witness +WG->>Ob: Save witness +WG->>DB: Create prover job +WVG-->>DB: Get prover job +DB->>WVG: Prover job +WVG->>WVG: Build witness vector +WVG-->>DB: Lock a free prover +DB->>WVG: Prover address +WVG->>P: Submit witness vector over TCP +P->>P: Generate a proof +P->>Ob: Store proof +P->>DB: Mark proof as stored +``` + +## Circuits + +Finally, even within the same level, there may be different circuit types. Under the hood, they prove the correctness of +different parts of computations. From a purely applied point of view, it mostly means that initially we receive X jobs +of N types, which cause Y jobs of M types, and so on. + +So, in addition to the aggregation layer, we also have a circuit ID.
A tuple of aggregation round and circuit ID forms a +unique job identifier, which allows us to understand which inputs we should receive, what processing logic we should +run, and which outputs we should produce. + +As of Jul 2024, we have 35 circuit types mapped to 5 aggregation layers. + +_Note:_ specifics of each circuit type and aggregation layers are out of scope for this document, but you can find more +information on that in the [further reading](99_further_reading.md) section. + +## Prover groups + +The next problem you would meet once you start proving in a production environment is that different +`(aggregation_round, circuit_id)` pairs have different load. For some, you need a lot of machines, while for some a few +is enough. + +To help with that, we spread the machines into 15 different groups, based on how "busy" they are, and configure each +group to work with a specific set of `(aggregation_round, circuit_id)` pairs only. + +Here you can see +[an example mapping](https://github.com/matter-labs/zksync-era/blob/3fbbee10be99e8c5a696bfd50d81230141bccbf4/etc/env/base/fri_prover_group.toml). + +Whenever you launch a witness generator, witness vector generator, or prover, it will check the group it belongs to, and +will only work with pairs configured for that group. + +If a non-existent group is chosen, all of the pairs will be processed by default. + +## Regions + +Since the number of jobs is high, a cluster in a single region may not have enough machines to process them in a timely +manner. Because of that, our prover infrastructure is designed to work across multiple clusters in different GCP +regions. + +It mostly doesn't affect the code, since we use Postgres and GCS for communication, with one major exception: since WVG +streams data directly to GPU provers via TCP, it will only look for prover machines that are registered in the same zone +as WVG in order to reduce network transfers (intra-AZ costs less than inter-AZ or even cross DC).
+ +## Protocol versions + +Finally, ZKsync has protocol versions, and it has upgrades from time to time. Each protocol version upgrade is defined +on L1, and the version follows SemVer convention, i.e. each version is defined as `0.x.y`. During the protocol version +upgrade, one of three things can change: + +- Protocol _behavior_. For example, we add new functionality and our VM starts working differently. +- Circuits _implementation_. For example, VM behavior doesn't change, but we add more constraints to the circuits. +- Contracts changes. For example, we add a new method to the contract, which affects neither VM nor circuits. + +For the first two cases, there will be changes in circuits, and there will be new verification keys. It means that the +proving process will be different. The latter has no implications for L2 behavior. + +As a result, after upgrade, we may need to generate different proofs. But given that upgrades happen asynchronously, we +cannot guarantee that all the "old" batches will be proven at the time of upgrade. + +Because of that, prover is protocol version aware. Each binary that participates in proving is designed to only generate +proofs for a single protocol version. Once the upgrade happens, "old" provers continue working on the "old" unproven +batches, and simultaneously we start spawning "new" provers for the batches generated with the new protocol version. +Once all the "old" batches are proven, no "old" provers will be spawned anymore. + +## Recap + +That's a quite sophisticated infrastructure, and it may be hard to understand it in one go. Here's a quick recap of this +page: + +- Main components of the prover subsystem are house keeper, prover gateway, witness generator, witness vector generator, + GPU prover, and proof compressor. +- House keeper and prover gateway don't perform any significant computations, and there is just one instance of each.
+- Witness generator, witness vector generator, and GPU prover work together as a "sub-pipeline". +- As of Jul 2024, the pipeline consists of 5 aggregation rounds, which are further split into 35 + `(aggregation_round, circuit_id)` pairs, followed by the proof compression. +- On the infrastructure level, these 35 pairs are spread across 15 different prover groups, according to how "busy" the + group is. +- Groups may exist in different clusters in different GCP regions. +- Provers are versioned according to the L1 protocol version. There may be provers with different versions running at + the same time. diff --git a/prover/docs/99_further_reading.md b/prover/docs/99_further_reading.md new file mode 100644 index 000000000000..64487a715d57 --- /dev/null +++ b/prover/docs/99_further_reading.md @@ -0,0 +1,13 @@ +# Further reading + +The documentation in this section aims to provide a practical overview of the prover workspace, e.g. help people to +understand how to run provers and what they do. + +However, we have some documentation that is more focused on theory of proving in the [core workspace docs](../../docs/). + +You may find the following articles helpful for general understanding of ZK proofs: + +- [ZK intuition](../../docs/guides/advanced/13_zk_intuition.md). +- [ZK deeper overview](../../docs/guides/advanced/14_zk_deeper_overview.md). +- [Prover keys](../../docs/guides/advanced/15_prover_keys.md). +- [Overview of our ZK proving system implementation](../../docs/specs/prover/). diff --git a/prover/docs/README.md b/prover/docs/README.md new file mode 100644 index 000000000000..62f3fc8d1c4c --- /dev/null +++ b/prover/docs/README.md @@ -0,0 +1,16 @@ +# Prover subsystem documentation + +This is technical documentation for the prover subsystem. It aims to help developers to set up a development environment +for working with provers. This documentation assumes that you are already familiar with how ZKsync works, and you need +to be able to work with the prover code.
+ +It does not cover topics such as basics of ZK or production deployment for provers. + +## Table of contents + +- [Intro](00_intro.md) +- [Setting up a GCP VM](01_gcp_vm.md) +- [Workspace setup](02_setup.md) +- [Running prover subsystem](03_launch.md) +- [Proof generation flow](04_flow.md) +- [Further reading](99_further_reading.md) diff --git a/prover/setup.sh b/prover/setup.sh deleted file mode 100755 index 2d546c1f8bd6..000000000000 --- a/prover/setup.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# This script sets up the necessary data needed by the CPU/GPU FRI prover to be used locally. - -GPU_FLAG="" -GENERATE_SK_COMMAND="generate-sk" -if [ "$1" = "gpu" ]; then - GPU_FLAG='--features gpu' - GENERATE_SK_COMMAND="generate-sk-gpu" -fi - -if [[ -z "${ZKSYNC_HOME}" ]]; then - echo "Environment variable ZKSYNC_HOME is not set. Make sure it's set and pointing to the root of this repository" - exit 1 -fi - -sed -i.backup 's/^proof_sending_mode=.*$/proof_sending_mode="OnlyRealProofs"/' ../etc/env/base/eth_sender.toml -rm ../etc/env/base/eth_sender.toml.backup -sed -i.backup 's/^setup_data_path=.*$/setup_data_path="vk_setup_data_generator_server_fri\/data\/"/' ../etc/env/base/fri_prover.toml -rm ../etc/env/base/fri_prover.toml.backup -sed -i.backup 's/^universal_setup_path=.*$/universal_setup_path="..\/keys\/setup\/setup_2^26.key"/' ../etc/env/base/fri_proof_compressor.toml -rm ../etc/env/base/fri_proof_compressor.toml.backup - -zk config compile dev - -# Update setup keys (only if they are not present) -zk f cargo run $GPU_FLAG --release --bin key_generator -- $GENERATE_SK_COMMAND all --recompute-if-missing