Skip to content

Commit

Permalink
fix: honk sumcheck performance (#2925)
Browse files Browse the repository at this point in the history
Parallelizing sumcheck partially_evaluate
Before

![image](https://github.com/AztecProtocol/aztec-packages/assets/163993/82c4038f-84c6-41f6-a91e-4b8a9d528cad)
After

![image](https://github.com/AztecProtocol/aztec-packages/assets/163993/9cdb0dd1-9369-403d-82e6-a3a3e5b21857)
(Note: when parallel_for_mutex_pool appears in the flamegraph, it is some
parallel_for operation, but its details are aggregated in different
threads.)

---------

Co-authored-by: ludamad <[email protected]>
Co-authored-by: codygunton <[email protected]>
  • Loading branch information
3 people authored Oct 19, 2023
1 parent 7042bc6 commit 5fbfe6e
Show file tree
Hide file tree
Showing 13 changed files with 86 additions and 56 deletions.
49 changes: 30 additions & 19 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -183,34 +183,40 @@
"MULTITHREADING": "ON"
}
},
{
"name": "xray-1thread",
"displayName": "Build with single-threaded XRay Profiling",
"description": "Build with Clang and enable single-threaded LLVM XRay for profiling",
"generator": "Unix Makefiles",
"inherits": "clang16",
"environment": {
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=10",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=10",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=10"
},
"cacheVariables": {
"MULTITHREADING": "OFF"
},
"binaryDir": "build-xray-1thread"
},
{
"name": "xray",
"displayName": "Build with multi-threaded XRay Profiling",
"description": "Build with Clang and enable multi-threaded LLVM XRay for profiling",
"generator": "Unix Makefiles",
"inherits": "clang16",
"environment": {
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=10",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=10",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=10"
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100"
},
"binaryDir": "build-xray"
},
{
"name": "xray-verbose",
"displayName": "Build with detailed XRay Profiling",
"description": "Build with Clang and enable detailed LLVM XRay for profiling",
"inherits": "xray",
"environment": {
"CFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150",
"CXXFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150",
"LDFLAGS": "-fxray-instrument -fxray-instruction-threshold=100 -finline-max-stacksize=150"
},
"binaryDir": "build-xray-verbose"
},
{
"name": "xray-1thread",
"displayName": "Build with single-threaded XRay Profiling",
"description": "Build with Clang and enable single-threaded LLVM XRay for profiling",
"inherits": "xray",
"cacheVariables": {
"MULTITHREADING": "OFF"
},
"binaryDir": "build-xray-1thread"
}
],
"buildPresets": [
Expand Down Expand Up @@ -303,6 +309,11 @@
"jobs": 0,
"targets": ["barretenberg.wasm"]
},
{
"name": "xray-verbose",
"configurePreset": "xray-verbose",
"inherits": "default"
},
{
"name": "xray-1thread",
"configurePreset": "xray-1thread",
Expand Down
3 changes: 2 additions & 1 deletion barretenberg/cpp/scripts/collect_profile_information.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,6 @@ llvm-xray-16 stack xray-log.honk_bench_main_simple.* \
--instr_map=./bin/honk_bench_main_simple --stack-format=flame --aggregate-threads --aggregation-type=time --all-stacks \
| node ../scripts/llvm_xray_stack_flame_corrector.js \
| shorten_cpp_names \
| ../scripts/flamegraph.pl > xray.svg
| ../scripts/flamegraph.pl --width 1200 --fontsize 10 \
> xray.svg
echo "Profiling complete, now you can do e.g. 'scp mainframe:`readlink -f xray.svg` .' on a local terminal and open the SVG in a browser."
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ BASELINE_BRANCH="master"

echo -e "\nComparing $BENCH_TARGET between $BASELINE_BRANCH and current branch:"
# Set some directories
BASE_DIR="$HOME/barretenberg/cpp"
BASE_DIR="$HOME/aztec-packages/barretenberg/cpp"
BUILD_DIR="$BASE_DIR/build-bench" # matches build dir specified in bench preset
BENCH_RESULTS_DIR="$BASE_DIR/tmp_bench_results"
BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

echo -e '\nComparing Ultra Plonk/Honk benchmarks.'
# Set some directories
BASE_DIR="$HOME/barretenberg/cpp"
BASE_DIR="$HOME/aztec-packages/barretenberg/cpp"
BUILD_DIR="$BASE_DIR/build-bench"
BENCH_RESULTS_DIR="$BASE_DIR/tmp_bench_results"
BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,24 @@
#include "barretenberg/stdlib/primitives/packed_byte_array/packed_byte_array.hpp"
#include "barretenberg/stdlib/primitives/witness/witness.hpp"

using namespace proof_system::plonk;

using UltraBuilder = proof_system::UltraCircuitBuilder;
using UltraHonk = proof_system::honk::UltraComposer;
using namespace proof_system;

template <typename Builder> void generate_sha256_test_circuit(Builder& builder, size_t num_iterations)
{
std::string in;
in.resize(32);
proof_system::plonk::stdlib::packed_byte_array<Builder> input(&builder, in);
plonk::stdlib::packed_byte_array<Builder> input(&builder, in);
for (size_t i = 0; i < num_iterations; i++) {
input = proof_system::plonk::stdlib::sha256<Builder>(input);
input = plonk::stdlib::sha256<Builder>(input);
}
}

/**
 * @brief Profiling driver: builds one proof, then repeatedly re-runs the sumcheck rounds.
 *
 * @details Calls construct_proof() once on the prover and afterwards invokes
 * execute_relation_check_rounds() 200 times, so that this step dominates the collected profile.
 * The function is tagged BBERG_INSTRUMENT and BBERG_NOINLINE; presumably this keeps it visible as
 * its own frame in XRay output (confirm against the macro definitions in compiler_hints.hpp).
 *
 * @param ext_prover Prover to drive; it is used through a non-const reference.
 */
BBERG_INSTRUMENT BBERG_NOINLINE void sumcheck_profiling(honk::UltraProver& ext_prover)
{
    // Number of times the relation-check rounds are repeated after the initial proof.
    constexpr size_t num_sumcheck_runs = 200;

    // NOTE(review): presumably required first so the prover state needed by the rounds exists; confirm.
    ext_prover.construct_proof();

    size_t runs_done = 0;
    while (runs_done < num_sumcheck_runs) {
        // Bench sumcheck
        ext_prover.execute_relation_check_rounds();
        ++runs_done;
    }
}

Expand All @@ -44,15 +50,14 @@ void construct_proof_ultra() noexcept
{
barretenberg::srs::init_crs_factory("../srs_db/ignition");
// Construct circuit and prover; don't include this part in measurement
auto builder = typename UltraHonk::CircuitBuilder();
generate_sha256_test_circuit<UltraBuilder>(builder, 1);
honk::UltraComposer::CircuitBuilder builder;
generate_sha256_test_circuit(builder, 1);
std::cout << "gates: " << builder.get_total_circuit_size() << std::endl;

auto composer = UltraHonk();
auto instance = composer.create_instance(builder);
auto ext_prover = composer.create_prover(instance);
for (size_t i = 0; i < 10; i++) {
auto proof = ext_prover.construct_proof();
}
honk::UltraComposer composer;
std::shared_ptr<honk::UltraComposer::Instance> instance = composer.create_instance(builder);
honk::UltraProver ext_prover = composer.create_prover(instance);
sumcheck_profiling(ext_prover);
}

int main()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,22 @@ void construct_proof_ultra(State& state, void (*test_circuit_function)(UltraBuil
BENCHMARK_CAPTURE(construct_proof_ultra, sha256, &bench_utils::generate_sha256_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra, keccak, &bench_utils::generate_keccak_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra,
ecdsa_verification,
&bench_utils::generate_ecdsa_verification_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra,
merkle_membership,
&bench_utils::generate_merkle_membership_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);

} // namespace ultra_honk_bench
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,22 @@ void construct_proof_ultra(State& state, void (*test_circuit_function)(UltraBuil
BENCHMARK_CAPTURE(construct_proof_ultra, sha256, &bench_utils::generate_sha256_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra, keccak, &bench_utils::generate_keccak_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra,
ecdsa_verification,
&bench_utils::generate_ecdsa_verification_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);
BENCHMARK_CAPTURE(construct_proof_ultra,
merkle_membership,
&bench_utils::generate_merkle_membership_test_circuit<UltraBuilder>)
->DenseRange(MIN_NUM_ITERATIONS, MAX_NUM_ITERATIONS)
->Repetitions(NUM_REPETITIONS)
->Unit(::benchmark::kSecond);
->Unit(::benchmark::kMillisecond);

} // namespace ultra_plonk_bench
18 changes: 18 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

// Portable wrappers around compiler-specific function attributes.
// Place them in front of a function declaration, e.g.
//   BBERG_INSTRUMENT BBERG_NOINLINE void f();
// A macro expands to nothing when no equivalent is defined below for the current compiler.

// BBERG_INLINE: mark a function `inline` and additionally request forced inlining
// (`__forceinline` when _WIN32 is defined, the `always_inline` attribute otherwise).
#ifdef _WIN32
#define BBERG_INLINE __forceinline inline
#else
#define BBERG_INLINE __attribute__((always_inline)) inline
#endif

// BBERG_INSTRUMENT / BBERG_NO_INSTRUMENT: force a function to be included in / excluded from
// LLVM XRay instrumentation. XRay attributes only exist in clang, so these are no-ops elsewhere.
#if defined(__clang__)
#define BBERG_INSTRUMENT [[clang::xray_always_instrument]]
#define BBERG_NO_INSTRUMENT [[clang::xray_never_instrument]]
#else
#define BBERG_INSTRUMENT
#define BBERG_NO_INSTRUMENT
#endif

// BBERG_NOINLINE: forbid inlining of a function.
// clang keeps its own spelling; GCC (which also defines __GNUC__) gets the equivalent gnu attribute
// instead of silently dropping the request.
// TODO(AD): Other compilers (e.g. MSVC)
#if defined(__clang__)
#define BBERG_NOINLINE [[clang::noinline]]
#elif defined(__GNUC__)
#define BBERG_NOINLINE [[gnu::noinline]]
#else
#define BBERG_NOINLINE
#endif
7 changes: 0 additions & 7 deletions barretenberg/cpp/src/barretenberg/common/inline.hpp

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <thread>
#include <vector>

#include "barretenberg/common/compiler_hints.hpp"

namespace {

class ThreadPool {
Expand Down Expand Up @@ -50,7 +52,7 @@ class ThreadPool {
std::condition_variable complete_condition_;
bool stop = false;

void worker_loop(size_t thread_index);
BBERG_NO_INSTRUMENT void worker_loop(size_t thread_index);

void do_iterations()
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/inline.hpp"
#include "barretenberg/common/compiler_hints.hpp"
#include "barretenberg/numeric/random/engine.hpp"
#include "barretenberg/numeric/uint128/uint128.hpp"
#include "barretenberg/numeric/uint256/uint256.hpp"
Expand Down
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "affine_element.hpp"
#include "barretenberg/common/inline.hpp"
#include "barretenberg/common/compiler_hints.hpp"
#include "barretenberg/common/mem.hpp"
#include "barretenberg/numeric/random/engine.hpp"
#include "barretenberg/numeric/uint256/uint256.hpp"
Expand Down
4 changes: 2 additions & 2 deletions barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,12 @@ template <typename Flavor> class SumcheckProver {
void partially_evaluate(auto& polynomials, size_t round_size, FF round_challenge)
{
// after the first round, operate in place on partially_evaluated_polynomials
for (size_t j = 0; j < polynomials.size(); ++j) {
parallel_for(polynomials.size(), [&](size_t j) {
for (size_t i = 0; i < round_size; i += 2) {
partially_evaluated_polynomials[j][i >> 1] =
polynomials[j][i] + round_challenge * (polynomials[j][i + 1] - polynomials[j][i]);
}
}
});
};
};

Expand Down

0 comments on commit 5fbfe6e

Please sign in to comment.