Skip to content

Commit

Permalink
feat: Parallel IPA (#3882)
Browse files Browse the repository at this point in the history
This PR:
1. Adds a new function run_loop_in_parallel to thread.hpp so that it's
easier to use parallelism in most cases without redoing the computation
for splitting the workload into chunks every time.
2. Uses this new functionality to parallelise logic in the IPA open and
verify procedures and in the methods they call (x20-x30 improvement for
ECCVM proving)
3. Fixes incorrect use of one of the vectors in the IPA opening procedure,
which added an extra O(n log n) of work.
4. Adds an IPA opening and verification benchmark
  • Loading branch information
Rumata888 authored Jan 11, 2024
1 parent 9355cda commit 7002a33
Show file tree
Hide file tree
Showing 8 changed files with 419 additions and 144 deletions.
1 change: 1 addition & 0 deletions barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(ipa_bench)
add_subdirectory(decrypt_bench)
add_subdirectory(pippenger_bench)
add_subdirectory(plonk_bench)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Each source represents a separate benchmark suite
set(BENCHMARK_SOURCES
    ipa.bench.cpp
)

# Required libraries for benchmark suites
set(LINKED_LIBRARIES
    benchmark::benchmark
    ultra_honk
)

# For each suite add an executable and a custom target that runs it,
# e.g. ipa.bench.cpp -> executable `ipa_bench`, run target `run_ipa`.
foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
    get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
    add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
    target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
    # COMMAND must name the executable target created above (`<name>_bench`), not the bare suite name:
    # naming the target lets CMake substitute the built binary's path and add the build dependency.
    add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME}_bench WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
endforeach()
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
#include <benchmark/benchmark.h>

using namespace benchmark;
using namespace barretenberg;
using namespace proof_system;
using namespace proof_system::honk::pcs::ipa;
namespace {
using Curve = curve::Grumpkin;
using Fr = Curve::ScalarField;
using IPA = IPA<Curve>;
using OpeningPair = honk::pcs::OpeningPair<Curve>;
using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
using Polynomial = Polynomial<Curve::ScalarField>;
using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;

constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
std::shared_ptr<barretenberg::srs::factories::CrsFactory<curve::Grumpkin>> crs_factory(
new barretenberg::srs::factories::FileCrsFactory<curve::Grumpkin>("../srs_db/grumpkin", 1 << 16));

auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);

std::vector<std::shared_ptr<honk::BaseTranscript>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);

/**
 * @brief Benchmark the construction of an IPA opening proof.
 *
 * @details For a polynomial of size 2^state.range(0) with random coefficients, only the call to
 * IPA::compute_opening_proof is timed. Polynomial generation, evaluation, commitment and the bookkeeping needed by
 * ipa_verify all happen while timing is paused. The transcript and opening claim of the last iteration are left in
 * prover_transcripts / opening_claims for ipa_verify to consume.
 *
 * @param state Google Benchmark state; range(0) is log2 of the polynomial size.
 */
void ipa_open(State& state) noexcept
{
    numeric::random::Engine& engine = numeric::random::get_debug_engine();

    // The size argument does not change between iterations, so derive everything that depends on it once.
    const auto log_n = static_cast<size_t>(state.range(0));
    // Shift a size_t rather than an int so the result has the right type and width.
    const size_t n = size_t{ 1 } << log_n;
    const size_t slot = log_n - MIN_POLYNOMIAL_DEGREE_LOG2;

    for (auto _ : state) {
        state.PauseTiming();
        // Construct the polynomial
        Polynomial poly(n);
        for (size_t i = 0; i < n; ++i) {
            poly[i] = Fr::random_element(&engine);
        }
        auto x = Fr::random_element(&engine);
        auto eval = poly.evaluate(x);
        const OpeningPair opening_pair = { x, eval };
        const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
        // initialize empty prover transcript
        auto prover_transcript = std::make_shared<honk::BaseTranscript>();
        // Store info for verifier while timing is paused. The slot shares ownership of the transcript object, so
        // whatever the prover writes into it below is visible through the stored pointer.
        prover_transcripts[slot] = prover_transcript;
        opening_claims[slot] = opening_claim;
        state.ResumeTiming();
        // Compute proof
        IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
    }
}
/**
 * @brief Benchmark IPA verification of a proof previously produced by ipa_open.
 *
 * @details Reads the prover transcript and opening claim that ipa_open stored for the same size argument. If no
 * transcript has been stored for that size (ipa_open did not run for it), the benchmark is reported as skipped with
 * an error instead of dereferencing a null pointer. Only the call to IPA::verify is timed; a fresh verifier
 * transcript is built from the stored proof data for every iteration while timing is paused.
 *
 * @param state Google Benchmark state; range(0) is log2 of the polynomial size.
 */
void ipa_verify(State& state) noexcept
{
    const size_t slot = static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2;

    // Retrieve proofs (loop-invariant, so done once up front)
    auto prover_transcript = prover_transcripts[slot];
    if (!prover_transcript) {
        state.SkipWithError("ipa_verify: no proof available; ipa_open must run for the same size first");
        return;
    }
    auto opening_claim = opening_claims[slot];

    for (auto _ : state) {
        state.PauseTiming();
        // initialize verifier transcript from proof data
        auto verifier_transcript = std::make_shared<honk::BaseTranscript>(prover_transcript->proof_data);

        state.ResumeTiming();
        auto result = IPA::verify(vk, opening_claim, verifier_transcript);
        // Keep the result observable even if ASSERT expands to nothing in this build configuration.
        DoNotOptimize(result);
        ASSERT(result);
    }
}
} // namespace
// Register both benchmarks over every polynomial size 2^k, k in [MIN_POLYNOMIAL_DEGREE_LOG2,
// MAX_POLYNOMIAL_DEGREE_LOG2], reporting times in milliseconds.
// ipa_verify reads the transcripts and claims that ipa_open stores, so ipa_open has to have run first for each size.
// NOTE(review): this presumably relies on benchmarks executing in registration order - confirm.
BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK_MAIN();
143 changes: 93 additions & 50 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,46 +52,67 @@ template <typename Curve> class IPA {
auto a_vec = polynomial;
auto srs_elements = ck->srs->get_monomial_points();
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

std::vector<Fr> b_vec(poly_degree);
Fr b_power = 1;
for (size_t i = 0; i < poly_degree; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
run_loop_in_parallel(
poly_degree,
[&b_vec, &opening_pair](size_t start, size_t end) {
Fr b_power = opening_pair.challenge.pow(start);
for (size_t i = start; i < end; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
},
/*no_multhreading_if_less_or_equal=*/16);

// Iterate for log(poly_degree) rounds to compute the round commitments.
auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
std::vector<GroupElement> L_elements(log_poly_degree);
std::vector<GroupElement> R_elements(log_poly_degree);
std::size_t round_size = poly_degree;

// TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the
// work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant
// improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
// loop can also be integrated.
// Perform IPA rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
std::mutex addition_lock;
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
for (size_t j = 0; j < round_size; j++) {
inner_prod_L += a_vec[j] * b_vec[round_size + j];
inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Run scalar product in parallel
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
addition_lock.lock();
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
addition_lock.unlock();
},
/*no_multhreading_if_less_or_equal=*/8);

// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_elements[i] =
// TODO(#473)
barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] += aux_generator * inner_prod_L;

// R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
// TODO(#473)
R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
R_elements[i] += aux_generator * inner_prod_R;
Expand All @@ -104,23 +125,32 @@ template <typename Curve> class IPA {
const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
const Fr round_challenge_inv = round_challenge.invert();

std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
auto G_lo = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
round_challenge_inv);
auto G_hi = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin() + static_cast<long>(round_size),
G_vec_local.begin() + static_cast<long>(round_size * 2) },
round_challenge);

// Update the vectors a_vec, b_vec and G_vec.
// a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
// b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
// G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
for (size_t j = 0; j < round_size; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
},
/*no_multhreading_if_less_or_equal=*/4);
}

transcript->send_to_verifier("IPA:a_0", a_vec[0]);
Expand Down Expand Up @@ -166,7 +196,7 @@ template <typename Curve> class IPA {
msm_scalars[2 * i] = round_challenges[i].sqr();
msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
}
// TODO(#473)

GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
GroupElement C_zero = C_prime + LR_sums;
Expand All @@ -188,29 +218,42 @@ template <typename Curve> class IPA {
// Compute G_zero
// First construct s_vec
std::vector<Fr> s_vec(poly_degree);
for (size_t i = 0; i < poly_degree; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
run_loop_in_parallel(
poly_degree,
[&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}
}
s_vec[i] = s_vec_scalar;
}
},
/*no_multhreading_if_less_or_equal=*/4);

auto srs_elements = vk->srs->get_monomial_points();

// Copy the G_vector to local memory.
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
// TODO(#473)
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);

Expand Down
41 changes: 41 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
#endif
#endif
}

/**
 * @brief Split a loop into several loops running in parallel
 *
 * @details Splits the index range [0, num_points) into contiguous, non-overlapping chunks of equal size (the last
 * chunk may be shorter) and calls func once per chunk through parallel_for. At most one chunk per CPU reported by
 * get_num_cpus() is created; when num_points is small, fewer chunks are dispatched so no empty work items are
 * scheduled. If num_points does not exceed the threshold, func(0, num_points) is called directly on the calling
 * thread (this includes num_points == 0).
 *
 * @param num_points Total number of elements
 * @param func A function or lambda expression with a for loop inside, for example:
 * [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
 * It receives the half-open range [start, end) it is responsible for. Because parallel_for may run the chunks
 * concurrently, func should only touch shared state in a way that is safe for that.
 * @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
 *
 */
void run_loop_in_parallel(size_t num_points,
                          const std::function<void(size_t, size_t)>& func,
                          size_t no_multhreading_if_less_or_equal)
{
    if (num_points <= no_multhreading_if_less_or_equal) {
        func(0, num_points);
        return;
    }
    // From here on num_points >= 1, because it is strictly greater than an unsigned threshold.

    // Get number of cpus we can split into; clamp to 1 so the divisions below are always well-defined
    const size_t num_cpus = std::max<size_t>(get_num_cpus(), 1);

    // Compute the size of a single chunk: ceil(num_points / num_cpus), which is at least 1
    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
    // Number of non-empty chunks: ceil(num_points / chunk_size). This never exceeds num_cpus and can be smaller
    // when num_points is small, so only as many work items as are actually needed get dispatched.
    const size_t num_chunks = (num_points / chunk_size) + (num_points % chunk_size == 0 ? 0 : 1);

    // Parallelize over chunks
    parallel_for(num_chunks, [num_points, chunk_size, &func](size_t chunk_index) {
        const size_t start = chunk_index * chunk_size;
        // The last chunk can be shorter than chunk_size
        const size_t end = start + std::min(chunk_size, num_points - start);
        func(start, end);
    });
}
3 changes: 3 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
}

void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
// Splits the range [0, num_points) into chunks and calls func(start, end) once per chunk so the chunks can be
// processed in parallel. If num_points is less than or equal to no_multhreading_if_less_or_equal, func(0, num_points)
// is called directly without parallelization.
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal = 0);
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ template <class Fq, class Fr, class Params> class alignas(32) element {

static void batch_normalize(element* elements, size_t num_elements) noexcept;
static std::vector<affine_element<Fq, Fr, Params>> batch_mul_with_endomorphism(
const std::vector<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;
const std::span<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;

Fq x;
Fq y;
Expand Down
Loading

0 comments on commit 7002a33

Please sign in to comment.