From 5fbfe6eeccdb23f734fb36f30d1e33340f9fb07a Mon Sep 17 00:00:00 2001 From: ludamad Date: Thu, 19 Oct 2023 09:31:38 -0400 Subject: [PATCH] fix: honk sumcheck performance (#2925) Parallelizing sumcheck partially_evaluate Before ![image](https://github.com/AztecProtocol/aztec-packages/assets/163993/82c4038f-84c6-41f6-a91e-4b8a9d528cad) After ![image](https://github.com/AztecProtocol/aztec-packages/assets/163993/9cdb0dd1-9369-403d-82e6-a3a3e5b21857) (note, when we see parallel_for_mutex_pool in the flamegraph it's some parallel_for operation but the details are aggregated in different threads) --------- Co-authored-by: ludamad Co-authored-by: codygunton --- barretenberg/cpp/CMakePresets.json | 49 ++++++++++++------- .../scripts/collect_profile_information.sh | 3 +- .../benchmark/compare_branch_vs_baseline.sh | 2 +- .../honk_bench/compare_honk_to_plonk_ultra.sh | 2 +- .../benchmark/honk_bench/main.simple.cpp | 33 +++++++------ .../benchmark/honk_bench/ultra_honk.bench.cpp | 8 +-- .../honk_bench/ultra_plonk.bench.cpp | 8 +-- .../barretenberg/common/compiler_hints.hpp | 18 +++++++ .../cpp/src/barretenberg/common/inline.hpp | 7 --- .../common/parallel_for_mutex_pool.cpp | 4 +- .../ecc/fields/field_declarations.hpp | 2 +- .../src/barretenberg/ecc/groups/element.hpp | 2 +- .../barretenberg/honk/sumcheck/sumcheck.hpp | 4 +- 13 files changed, 86 insertions(+), 56 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp delete mode 100644 barretenberg/cpp/src/barretenberg/common/inline.hpp diff --git a/barretenberg/cpp/CMakePresets.json b/barretenberg/cpp/CMakePresets.json index 6bafa8fed7a..c54e0b3419e 100644 --- a/barretenberg/cpp/CMakePresets.json +++ b/barretenberg/cpp/CMakePresets.json @@ -183,22 +183,6 @@ "MULTITHREADING": "ON" } }, - { - "name": "xray-1thread", - "displayName": "Build with single-threaded XRay Profiling", - "description": "Build with Clang and enable single-threaded LLVM XRay for profiling", - "generator": "Unix Makefiles", - "inherits": "clang16", - "environment": { - "CFLAGS": "-fxray-instrument -fxray-instruction-threshold=10", - "CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=10", - "LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=10" - }, - "cacheVariables": { - "MULTITHREADING": "OFF" - }, - "binaryDir": "build-xray-1thread" - }, { "name": "xray", "displayName": "Build with multi-threaded XRay Profiling", @@ -206,11 +190,33 @@ "generator": "Unix Makefiles", "inherits": "clang16", "environment": { - "CFLAGS": "-fxray-instrument -fxray-instruction-threshold=10", - "CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=10", - "LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=10" + "CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100", + "CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100", + "LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100" }, "binaryDir": "build-xray" + }, + { + "name": "xray-verbose", + "displayName": "Build with detailed XRay Profiling", + "description": "Build with Clang and enable detailed LLVM XRay for profiling", + "inherits": "xray", + "environment": { + "CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150", + "CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150", + "LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150" + }, + "binaryDir": "build-xray-verbose" + }, + { + "name": "xray-1thread", + "displayName": "Build with single-threaded XRay Profiling", + "description": "Build with Clang and enable single-threaded LLVM XRay for profiling", + "inherits": "xray", + "cacheVariables": { + "MULTITHREADING": "OFF" + }, + "binaryDir": "build-xray-1thread" } ], "buildPresets": [ @@ -303,6 +309,11 @@ "jobs": 0, "targets": ["barretenberg.wasm"] }, + { + "name": "xray-verbose", + "configurePreset": "xray-verbose", + "inherits": "default" + }, { "name": "xray-1thread", "configurePreset": "xray-1thread", diff --git a/barretenberg/cpp/scripts/collect_profile_information.sh b/barretenberg/cpp/scripts/collect_profile_information.sh index 62757181ac3..0b7d79ef8ed 100755 --- a/barretenberg/cpp/scripts/collect_profile_information.sh +++ b/barretenberg/cpp/scripts/collect_profile_information.sh @@ -41,5 +41,6 @@ llvm-xray-16 stack xray-log.honk_bench_main_simple.* \ --instr_map=./bin/honk_bench_main_simple --stack-format=flame --aggregate-threads --aggregation-type=time --all-stacks \ | node ../scripts/llvm_xray_stack_flame_corrector.js \ | shorten_cpp_names \ - | ../scripts/flamegraph.pl > xray.svg + | ../scripts/flamegraph.pl --width 1200 --fontsize 10 \ + > xray.svg echo "Profiling complete, now you can do e.g. 'scp mainframe:`readlink -f xray.svg` .' on a local terminal and open the SVG in a browser." diff --git a/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh b/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh index f0105bb7a34..0ac6dce1157 100755 --- a/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh +++ b/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh @@ -10,7 +10,7 @@ BASELINE_BRANCH="master" echo -e "\nComparing $BENCH_TARGET between $BASELINE_BRANCH and current branch:" # Set some directories -BASE_DIR="$HOME/barretenberg/cpp" +BASE_DIR="$HOME/aztec-packages/barretenberg/cpp" BUILD_DIR="$BASE_DIR/build-bench" # matches build dir specified in bench preset BENCH_RESULTS_DIR="$BASE_DIR/tmp_bench_results" BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools" diff --git a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_to_plonk_ultra.sh b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_to_plonk_ultra.sh index 0e5625e9309..1863327ae4e 100755 --- a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_to_plonk_ultra.sh +++ b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_to_plonk_ultra.sh @@ -6,7 +6,7 @@ echo -e '\nComparing Ultra Plonk/Honk benchmarks.' # Set some directories -BASE_DIR="$HOME/barretenberg/cpp" +BASE_DIR="$HOME/aztec-packages/barretenberg/cpp" BUILD_DIR="$BASE_DIR/build-bench" BENCH_RESULTS_DIR="$BASE_DIR/tmp_bench_results" BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools" diff --git a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/main.simple.cpp b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/main.simple.cpp index f33faf554d1..84a2f3c8c88 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/main.simple.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/main.simple.cpp @@ -22,18 +22,24 @@ #include "barretenberg/stdlib/primitives/packed_byte_array/packed_byte_array.hpp" #include "barretenberg/stdlib/primitives/witness/witness.hpp" -using namespace proof_system::plonk; - -using UltraBuilder = proof_system::UltraCircuitBuilder; -using UltraHonk = proof_system::honk::UltraComposer; +using namespace proof_system; template void generate_sha256_test_circuit(Builder& builder, size_t num_iterations) { std::string in; in.resize(32); - proof_system::plonk::stdlib::packed_byte_array input(&builder, in); + plonk::stdlib::packed_byte_array input(&builder, in); for (size_t i = 0; i < num_iterations; i++) { - input = proof_system::plonk::stdlib::sha256(input); + input = plonk::stdlib::sha256(input); + } +} + +BBERG_INSTRUMENT BBERG_NOINLINE void sumcheck_profiling(honk::UltraProver& ext_prover) +{ + ext_prover.construct_proof(); + for (size_t i = 0; i < 200; i++) { + // Bench sumcheck + ext_prover.execute_relation_check_rounds(); } } @@ -44,15 +50,14 @@ void construct_proof_ultra() noexcept { barretenberg::srs::init_crs_factory("../srs_db/ignition"); // Constuct circuit and prover; don't include this part in measurement - auto builder = typename UltraHonk::CircuitBuilder(); - generate_sha256_test_circuit(builder, 1); + honk::UltraComposer::CircuitBuilder builder; + generate_sha256_test_circuit(builder, 1); + std::cout << "gates: " << builder.get_total_circuit_size() << std::endl; - auto composer = UltraHonk(); - auto instance = composer.create_instance(builder); - auto ext_prover = composer.create_prover(instance); - for (size_t i = 0; i < 10; i++) { - auto proof = ext_prover.construct_proof(); - } + honk::UltraComposer composer; + std::shared_ptr instance = composer.create_instance(builder); + honk::UltraProver ext_prover = composer.create_prover(instance); + sumcheck_profiling(ext_prover); } int main() diff --git a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_honk.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_honk.bench.cpp index 56c45d24ef0..92933bb4648 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_honk.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_honk.bench.cpp @@ -30,22 +30,22 @@ void construct_proof_ultra(State& state, void (*test_circuit_function)(UltraBuil BENCHMARK_CAPTURE(construct_proof_ultra, sha256, &bench_utils::generate_sha256_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, keccak, &bench_utils::generate_keccak_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, ecdsa_verification, &bench_utils::generate_ecdsa_verification_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, merkle_membership, &bench_utils::generate_merkle_membership_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); } // namespace ultra_honk_bench \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_plonk.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_plonk.bench.cpp index 196245f4ea3..74a9fd1acc7 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_plonk.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/honk_bench/ultra_plonk.bench.cpp @@ -26,22 +26,22 @@ void construct_proof_ultra(State& state, void (*test_circuit_function)(UltraBuil BENCHMARK_CAPTURE(construct_proof_ultra, sha256, &bench_utils::generate_sha256_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, keccak, &bench_utils::generate_keccak_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, ecdsa_verification, &bench_utils::generate_ecdsa_verification_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); BENCHMARK_CAPTURE(construct_proof_ultra, merkle_membership, &bench_utils::generate_merkle_membership_test_circuit) ->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS) ->Repetitions(NUM_REPETITIONS) - ->Unit(::benchmark::kSecond); + ->Unit(::benchmark::kMillisecond); } // namespace ultra_plonk_bench \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp new file mode 100644 index 00000000000..c3bdf3cf6b9 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp @@ -0,0 +1,18 @@ +#pragma once + +#ifdef _WIN32 +#define BBERG_INLINE __forceinline inline +#else +#define BBERG_INLINE __attribute__((always_inline)) inline +#endif + +// TODO(AD): Other compilers +#if defined(__clang__) +#define BBERG_INSTRUMENT [[clang::xray_always_instrument]] +#define BBERG_NO_INSTRUMENT [[clang::xray_never_instrument]] +#define BBERG_NOINLINE [[clang::noinline]] +#else +#define BBERG_INSTRUMENT +#define BBERG_NO_INSTRUMENT +#define BBERG_NOINLINE +#endif \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/common/inline.hpp b/barretenberg/cpp/src/barretenberg/common/inline.hpp deleted file mode 100644 index ee5be8ac78e..00000000000 --- a/barretenberg/cpp/src/barretenberg/common/inline.hpp +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#ifdef _WIN32 -#define BBERG_INLINE __forceinline inline -#else -#define BBERG_INLINE __attribute__((always_inline)) inline -#endif diff --git a/barretenberg/cpp/src/barretenberg/common/parallel_for_mutex_pool.cpp b/barretenberg/cpp/src/barretenberg/common/parallel_for_mutex_pool.cpp index c8fb4ebf5ec..47e03b5ea85 100644 --- a/barretenberg/cpp/src/barretenberg/common/parallel_for_mutex_pool.cpp +++ b/barretenberg/cpp/src/barretenberg/common/parallel_for_mutex_pool.cpp @@ -8,6 +8,8 @@ #include #include +#include "barretenberg/common/compiler_hints.hpp" + namespace { class ThreadPool { @@ -50,7 +52,7 @@ class ThreadPool { std::condition_variable complete_condition_; bool stop = false; - void worker_loop(size_t thread_index); + BBERG_NO_INSTRUMENT void worker_loop(size_t thread_index); void do_iterations() { diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp index 799c202f709..c64b1e35196 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp @@ -1,6 +1,6 @@ #pragma once #include "barretenberg/common/assert.hpp" -#include "barretenberg/common/inline.hpp" +#include "barretenberg/common/compiler_hints.hpp" #include "barretenberg/numeric/random/engine.hpp" #include "barretenberg/numeric/uint128/uint128.hpp" #include "barretenberg/numeric/uint256/uint256.hpp" diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp index 3eba4298afb..0b2d13761f4 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp @@ -1,7 +1,7 @@ #pragma once #include "affine_element.hpp" -#include "barretenberg/common/inline.hpp" +#include "barretenberg/common/compiler_hints.hpp" #include "barretenberg/common/mem.hpp" #include "barretenberg/numeric/random/engine.hpp" #include "barretenberg/numeric/uint256/uint256.hpp" diff --git a/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck.hpp b/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck.hpp index b8348c819d2..81af019be73 100644 --- a/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck.hpp @@ -139,12 +139,12 @@ template class SumcheckProver { void partially_evaluate(auto& polynomials, size_t round_size, FF round_challenge) { // after the first round, operate in place on partially_evaluated_polynomials - for (size_t j = 0; j < polynomials.size(); ++j) { + parallel_for(polynomials.size(), [&](size_t j) { for (size_t i = 0; i < round_size; i += 2) { partially_evaluated_polynomials[j][i >> 1] = polynomials[j][i] + round_challenge * (polynomials[j][i + 1] - polynomials[j][i]); } - } + }); }; };