test(vm): Refactor VM benchmarks (#2668)

## What ❔ - Integrates Prometheus metrics into criterion benches; removes the DIY benchmark accordingly. - Merges the main benchmark crate with the harness one. - Includes benched bytecodes in the crate itself rather than reading them at runtime. ## Why ❔ Makes VM benchmarks more maintainable. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [x] Tests for the changes have been added / updated. - [x] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`.
matter-labs · Aug 27, 2024 · bd2b5d8 · bd2b5d8
1 parent a4170e9
commit bd2b5d8
Show file tree

Hide file tree

Showing 36 changed files with 988 additions and 645 deletions.
diff --git a/.github/workflows/ci-core-reusable.yml b/.github/workflows/ci-core-reusable.yml
@@ -67,7 +67,7 @@ jobs:
           ci_run zk test rust
           # Benchmarks are not tested by `cargo nextest` unless specified explicitly, and even then `criterion` harness is incompatible
           # with how `cargo nextest` runs tests. Thus, we run criterion-based benchmark tests manually.
-          ci_run zk f cargo test --release -p vm-benchmark --bench criterion --bench fill_bootloader
+          ci_run zk f cargo test --release -p vm-benchmark --bench oneshot --bench batch
 
   loadtest:
     runs-on: [matterlabs-ci-runner]

diff --git a/.github/workflows/vm-perf-comparison.yml b/.github/workflows/vm-perf-comparison.yml
@@ -1,4 +1,4 @@
-name: Compare VM perfomance to base branch
+name: Compare VM performance to base branch
 
 on:
   pull_request:
@@ -47,7 +47,7 @@ jobs:
           ci_run zk
           ci_run zk compiler system-contracts
           ci_run cargo bench --package vm-benchmark --bench iai | tee base-iai
-          ci_run cargo run --package vm-benchmark --release --bin instruction-counts | tee base-opcodes || touch base-opcodes
+          ci_run cargo run --package vm-benchmark --release --bin instruction_counts | tee base-opcodes || touch base-opcodes
           ci_run yarn workspace system-contracts clean
 
       - name: checkout PR
@@ -59,7 +59,7 @@ jobs:
           ci_run zk
           ci_run zk compiler system-contracts
           ci_run cargo bench --package vm-benchmark --bench iai | tee pr-iai
-          ci_run cargo run --package vm-benchmark --release --bin instruction-counts | tee pr-opcodes || touch pr-opcodes
+          ci_run cargo run --package vm-benchmark --release --bin instruction_counts | tee pr-opcodes || touch pr-opcodes
 
           EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64)
           echo "speedup<<$EOF" >> $GITHUB_OUTPUT

diff --git a/.github/workflows/vm-perf-to-prometheus.yml b/.github/workflows/vm-perf-to-prometheus.yml
@@ -21,7 +21,7 @@ jobs:
 
       - name: setup-env
         run: |
-          echo PUSH_VM_BENCHMARKS_TO_PROMETHEUS=1 >> .env
+          echo BENCHMARK_PROMETHEUS_PUSHGATEWAY_URL=${{ secrets.BENCHMARK_PROMETHEUS_PUSHGATEWAY_URL }} >> .env
 
           echo ZKSYNC_HOME=$(pwd) >> $GITHUB_ENV
           echo $(pwd)/bin >> $GITHUB_PATH
@@ -31,10 +31,12 @@ jobs:
           run_retried docker compose pull zk
           docker compose up -d zk
           ci_run zk
-          ci_run zk compiler system-contracts
+          ci_run zk compiler all
 
       - name: run benchmarks
         run: |
-          ci_run cargo bench --package vm-benchmark --bench diy_benchmark
+          ci_run cargo bench --package vm-benchmark --bench oneshot
+          # Run only benches with 1,000 transactions per batch to not spend too much time
+          ci_run cargo bench --package vm-benchmark --bench batch '/1000$'
           ci_run cargo bench --package vm-benchmark --bench iai | tee iai-result
           ci_run cargo run --package vm-benchmark --bin iai_results_to_prometheus --release < iai-result
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -79,7 +79,6 @@ members = [
     "core/tests/test_account",
     "core/tests/loadnext",
     "core/tests/vm-benchmark",
-    "core/tests/vm-benchmark/harness",
     # Parts of prover workspace that are needed for Core workspace
     "prover/crates/lib/prover_dal",
 ]
@@ -238,7 +237,6 @@ zksync_prover_dal = { version = "0.1.0", path = "prover/crates/lib/prover_dal" }
 zksync_vlog = { version = "0.1.0", path = "core/lib/vlog" }
 zksync_vm_interface = { version = "0.1.0", path = "core/lib/vm_interface" }
 zksync_vm_utils = { version = "0.1.0", path = "core/lib/vm_utils" }
-zksync_vm_benchmark_harness = { version = "0.1.0", path = "core/tests/vm-benchmark/harness" }
 zksync_basic_types = { version = "0.1.0", path = "core/lib/basic_types" }
 zksync_circuit_breaker = { version = "0.1.0", path = "core/lib/circuit_breaker" }
 zksync_config = { version = "0.1.0", path = "core/lib/config" }

diff --git a/core/tests/vm-benchmark/Cargo.toml b/core/tests/vm-benchmark/Cargo.toml
@@ -6,46 +6,30 @@ license.workspace = true
 publish = false
 
 [dependencies]
+zksync_contracts.workspace = true
+zksync_multivm.workspace = true
 zksync_types.workspace = true
+zksync_utils.workspace = true
 zksync_vlog.workspace = true
-zksync_vm_benchmark_harness.workspace = true
 
+criterion.workspace = true
+once_cell.workspace = true
 rand.workspace = true
 vise.workspace = true
 tokio.workspace = true
 
 [dev-dependencies]
-criterion.workspace = true
+assert_matches.workspace = true
 iai.workspace = true
 
 [[bench]]
-name = "criterion"
+name = "oneshot"
 harness = false
 
 [[bench]]
-name = "diy_benchmark"
+name = "batch"
 harness = false
 
 [[bench]]
 name = "iai"
 harness = false
-
-[[bench]]
-name = "fill_bootloader"
-harness = false
-
-[[bin]]
-name = "iai_results_to_prometheus"
-path = "src/iai_results_to_prometheus.rs"
-
-[[bin]]
-name = "compare_iai_results"
-path = "src/compare_iai_results.rs"
-
-[[bin]]
-name = "find-slowest"
-path = "src/find_slowest.rs"
-
-[[bin]]
-name = "instruction-counts"
-path = "src/instruction_counts.rs"
diff --git a/core/tests/vm-benchmark/README.md b/core/tests/vm-benchmark/README.md
@@ -9,35 +9,22 @@ benchmarks, however.
 There are three different benchmarking tools available:
 
 ```sh
-cargo bench --bench criterion
-cargo bench --bench diy_benchmark
+cargo bench --bench oneshot
+cargo bench --bench batch
 cargo +nightly bench --bench iai
 ```
 
-Criterion is the de-facto microbenchmarking tool for Rust. Run it, then optimize something and run the command again to
-see if your changes have made a difference.
+`oneshot` and `batch` targets use Criterion, the de-facto standard micro-benchmarking tool for Rust. `oneshot` measures
+VM performance on single transactions, and `batch` on entire batches of up to 5,000 transactions. Run these benches,
+then optimize something and run the command again to see if your changes have made a difference.
 
-The DIY benchmark works a bit better in noisy environments and is used to push benchmark data to Prometheus
-automatically.
+IAI uses cachegrind to simulate the CPU, so noise is completely irrelevant to it, but it also doesn't measure exactly
+the same thing as normal benchmarks. You need valgrind to be able to run it.
 
-IAI uses cachegrind to simulate the CPU, so noise is completely irrelevant to it but it also doesn't measure exactly the
-same thing as normal benchmarks. You need valgrind to be able to run it.
-
-You can add your own bytecodes to be benchmarked into the folder "deployment_benchmarks". For iai, you also need to add
-them to "benches/iai.rs".
+You can add new bytecodes to be benchmarked into the [`bytecodes`](src/bytecodes) directory and then add them to the
+`BYTECODES` constant exported by the crate.
 
 ## Profiling (Linux only)
 
 You can also use `sh perf.sh bytecode_file` to produce data that can be fed into the
 [firefox profiler](https://profiler.firefox.com/) for a specific bytecode.
-
-## Fuzzing
-
-There is a fuzzer using this library at core/lib/vm/fuzz. The fuzz.sh script located there starts a fuzzer which
-attempts to make cover as much code as it can to ultimately produce a valid deployment bytecode.
-
-It has no chance of succeeding currently because the fuzzing speed drops to 10 executions/s easily. Optimizing the VM or
-lowering the gas limit will help with that.
-
-The fuzzer has been useful for producing synthetic benchmark inputs. It may be a good tool for finding show transactions
-with a certain gas limit, an empirical way of evaluating gas prices of instructions.
diff --git a/core/tests/vm-benchmark/benches/fill_bootloader.rs b/core/tests/vm-benchmark/benches/batch.rs
@@ -14,17 +14,15 @@
 
 use std::{iter, time::Duration};
 
-use criterion::{
-    black_box, criterion_group, criterion_main, measurement::WallTime, BatchSize, BenchmarkGroup,
-    BenchmarkId, Criterion, Throughput,
-};
+use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use zksync_types::Transaction;
-use zksync_vm_benchmark_harness::{
-    cut_to_allowed_bytecode_size, get_deploy_tx_with_gas_limit, get_heavy_load_test_tx,
-    get_load_test_deploy_tx, get_load_test_tx, get_realistic_load_test_tx, get_transfer_tx,
-    BenchmarkingVm, BenchmarkingVmFactory, Fast, Legacy, LoadTestParams,
+use vm_benchmark::{
+    criterion::{is_test_mode, BenchmarkGroup, BenchmarkId, CriterionExt, MeteredTime},
+    get_deploy_tx_with_gas_limit, get_heavy_load_test_tx, get_load_test_deploy_tx,
+    get_load_test_tx, get_realistic_load_test_tx, get_transfer_tx, BenchmarkingVm,
+    BenchmarkingVmFactory, Bytecode, Fast, Legacy, LoadTestParams,
 };
+use zksync_types::Transaction;
 
 /// Gas limit for deployment transactions.
 const DEPLOY_GAS_LIMIT: u32 = 30_000_000;
@@ -59,7 +57,7 @@ fn bench_vm<VM: BenchmarkingVmFactory, const FULL: bool>(
 }
 
 fn run_vm_expecting_failures<VM: BenchmarkingVmFactory, const FULL: bool>(
-    group: &mut BenchmarkGroup<'_, WallTime>,
+    group: &mut BenchmarkGroup<'_>,
     name: &str,
     txs: &[Transaction],
     expected_failures: &[bool],
@@ -70,48 +68,48 @@ fn run_vm_expecting_failures<VM: BenchmarkingVmFactory, const FULL: bool>(
         }
 
         group.throughput(Throughput::Elements(*txs_in_batch as u64));
-        group.bench_with_input(
+        group.bench_metered_with_input(
             BenchmarkId::new(name, txs_in_batch),
             txs_in_batch,
             |bencher, &txs_in_batch| {
                 if FULL {
                     // Include VM initialization / drop into the measured time
-                    bencher.iter(|| {
+                    bencher.iter(|timer| {
+                        let _guard = timer.start();
                         let mut vm = BenchmarkingVm::<VM>::default();
                         bench_vm::<_, true>(&mut vm, &txs[..txs_in_batch], expected_failures);
                     });
                 } else {
-                    bencher.iter_batched(
-                        BenchmarkingVm::<VM>::default,
-                        |mut vm| {
-                            bench_vm::<_, false>(&mut vm, &txs[..txs_in_batch], expected_failures);
-                            vm
-                        },
-                        BatchSize::LargeInput, // VM can consume significant amount of RAM, especially the new one
-                    );
+                    bencher.iter(|timer| {
+                        let mut vm = BenchmarkingVm::<VM>::default();
+                        let guard = timer.start();
+                        bench_vm::<_, false>(&mut vm, &txs[..txs_in_batch], expected_failures);
+                        drop(guard);
+                    });
                 }
             },
         );
     }
 }
 
 fn run_vm<VM: BenchmarkingVmFactory, const FULL: bool>(
-    group: &mut BenchmarkGroup<'_, WallTime>,
+    group: &mut BenchmarkGroup<'_>,
     name: &str,
     txs: &[Transaction],
 ) {
     run_vm_expecting_failures::<VM, FULL>(group, name, txs, &[]);
 }
 
-fn bench_fill_bootloader<VM: BenchmarkingVmFactory, const FULL: bool>(c: &mut Criterion) {
-    let is_test_mode = !std::env::args().any(|arg| arg == "--bench");
-    let txs_in_batch = if is_test_mode {
+fn bench_fill_bootloader<VM: BenchmarkingVmFactory, const FULL: bool>(
+    c: &mut Criterion<MeteredTime>,
+) {
+    let txs_in_batch = if is_test_mode() {
         &TXS_IN_BATCH[..3] // Reduce the number of transactions in a batch so that tests don't take long
     } else {
         TXS_IN_BATCH
     };
 
-    let mut group = c.benchmark_group(if FULL {
+    let mut group = c.metered_group(if FULL {
         format!("fill_bootloader_full{}", VM::LABEL.as_suffix())
     } else {
         format!("fill_bootloader{}", VM::LABEL.as_suffix())
@@ -121,12 +119,12 @@ fn bench_fill_bootloader<VM: BenchmarkingVmFactory, const FULL: bool>(c: &mut Cr
         .measurement_time(Duration::from_secs(10));
 
     // Deploying simple contract
-    let test_contract =
-        std::fs::read("deployment_benchmarks/deploy_simple_contract").expect("failed to read file");
-    let code = cut_to_allowed_bytecode_size(&test_contract).unwrap();
+    let test_contract = Bytecode::get("deploy_simple_contract");
     let max_txs = *txs_in_batch.last().unwrap() as u32;
     let txs: Vec<_> = (0..max_txs)
-        .map(|nonce| get_deploy_tx_with_gas_limit(code, DEPLOY_GAS_LIMIT, nonce))
+        .map(|nonce| {
+            get_deploy_tx_with_gas_limit(test_contract.bytecode(), DEPLOY_GAS_LIMIT, nonce)
+        })
         .collect();
     run_vm::<VM, FULL>(&mut group, "deploy_simple_contract", &txs);
     drop(txs);
@@ -187,9 +185,12 @@ fn bench_fill_bootloader<VM: BenchmarkingVmFactory, const FULL: bool>(c: &mut Cr
 }
 
 criterion_group!(
-    benches,
-    bench_fill_bootloader::<Fast, false>,
-    bench_fill_bootloader::<Fast, true>,
-    bench_fill_bootloader::<Legacy, false>
+    name = benches;
+    config = Criterion::default()
+        .configure_from_args()
+        .with_measurement(MeteredTime::new("fill_bootloader"));
+    targets = bench_fill_bootloader::<Fast, false>,
+        bench_fill_bootloader::<Fast, true>,
+        bench_fill_bootloader::<Legacy, false>
 );
 criterion_main!(benches);