From 7002a332da3bb9a75d5164a068a2bd9ea1ad211a Mon Sep 17 00:00:00 2001 From: Innokentii Sennovskii Date: Thu, 11 Jan 2024 17:41:28 +0000 Subject: [PATCH] feat: Parallel IPA (#3882) This PR: 1. Adds a new function run_loop_in_parallel to thread.hpp so that it's easier to use parallelism in most cases without redoing the computation for splitting the workload into chunks every time. 2. Uses this new functionality to parallelise logic in IPA open and verify procedures (and used methods) (x20-x30 improvement for ECCVM proving) 3. Fixes an error in using one of the vectors in IPA opening procedure which led to an additional nlogn complexity. 4. Adds an IPA opening and verification benchmark --- .../src/barretenberg/benchmark/CMakeLists.txt | 1 + .../benchmark/ipa_bench/CMakeLists.txt | 18 ++ .../benchmark/ipa_bench/ipa.bench.cpp | 73 +++++ .../commitment_schemes/ipa/ipa.hpp | 143 +++++---- .../cpp/src/barretenberg/common/thread.cpp | 41 +++ .../cpp/src/barretenberg/common/thread.hpp | 3 + .../src/barretenberg/ecc/groups/element.hpp | 2 +- .../barretenberg/ecc/groups/element_impl.hpp | 282 ++++++++++++------ 8 files changed, 419 insertions(+), 144 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 49bcb251f07..4a13dff546d 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(ipa_bench) add_subdirectory(decrypt_bench) add_subdirectory(pippenger_bench) add_subdirectory(plonk_bench) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt new file mode 100644 index 00000000000..6e66ce2cf36 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt @@ -0,0 +1,18 @@ +# Each source represents a separate benchmark suite +set(BENCHMARK_SOURCES + ipa.bench.cpp +) + +# Required libraries for benchmark suites +set(LINKED_LIBRARIES + benchmark::benchmark + ultra_honk +) + +# Add executable and custom target for each suite, e.g. ultra_honk_bench +foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES}) + get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension + add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES}) + add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endforeach() \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp new file mode 100644 index 00000000000..9d4f7028e55 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp @@ -0,0 +1,73 @@ +#include "barretenberg/commitment_schemes/ipa/ipa.hpp" +#include + +using namespace benchmark; +using namespace barretenberg; +using namespace proof_system; +using namespace proof_system::honk::pcs::ipa; +namespace { +using Curve = curve::Grumpkin; +using Fr = Curve::ScalarField; +using IPA = IPA; +using OpeningPair = honk::pcs::OpeningPair; +using OpeningClaim = honk::pcs::OpeningClaim; +using Polynomial = Polynomial; +using CommitmentKey = honk::pcs::CommitmentKey; +using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey; + +constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10; +constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16; +std::shared_ptr> crs_factory( + new barretenberg::srs::factories::FileCrsFactory("../srs_db/grumpkin", 1 << 16)); + +auto ck = std::make_shared(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory); +auto vk = std::make_shared(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory); + +std::vector> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 - + MIN_POLYNOMIAL_DEGREE_LOG2 + 1); +std::vector opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1); + +void ipa_open(State& state) noexcept +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + for (auto _ : state) { + state.PauseTiming(); + size_t n = 1 << static_cast(state.range(0)); + // Construct the polynomial + Polynomial poly(n); + for (size_t i = 0; i < n; ++i) { + poly[i] = Fr::random_element(&engine); + } + auto x = Fr::random_element(&engine); + auto eval = poly.evaluate(x); + const OpeningPair opening_pair = { x, eval }; + const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) }; + // initialize empty prover transcript + auto prover_transcript = std::make_shared(); + state.ResumeTiming(); + // Compute proof + IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript); + // Store info for verifier + prover_transcripts[static_cast(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript; + opening_claims[static_cast(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim; + } +} +void ipa_verify(State& state) noexcept +{ + for (auto _ : state) { + state.PauseTiming(); + // Retrieve proofs + auto prover_transcript = prover_transcripts[static_cast(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2]; + auto opening_claim = opening_claims[static_cast(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2]; + // initialize verifier transcript from proof data + auto verifier_transcript = std::make_shared(prover_transcript->proof_data); + + state.ResumeTiming(); + auto result = IPA::verify(vk, opening_claim, verifier_transcript); + ASSERT(result); + } +} +} // namespace +BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2); +BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2); +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 85cfb13e668..460f5d6dc49 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -52,46 +52,67 @@ template class IPA { auto a_vec = polynomial; auto srs_elements = ck->srs->get_monomial_points(); std::vector G_vec_local(poly_degree); + // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - for (size_t i = 0; i < poly_degree * 2; i += 2) { - G_vec_local[i >> 1] = srs_elements[i]; - } + run_loop_in_parallel( + poly_degree, + [&G_vec_local, srs_elements](size_t start, size_t end) { + for (size_t i = start * 2; i < end * 2; i += 2) { + G_vec_local[i >> 1] = srs_elements[i]; + } + }, + /*no_multhreading_if_less_or_equal=*/16); + std::vector b_vec(poly_degree); - Fr b_power = 1; - for (size_t i = 0; i < poly_degree; i++) { - b_vec[i] = b_power; - b_power *= opening_pair.challenge; - } + run_loop_in_parallel( + poly_degree, + [&b_vec, &opening_pair](size_t start, size_t end) { + Fr b_power = opening_pair.challenge.pow(start); + for (size_t i = start; i < end; i++) { + b_vec[i] = b_power; + b_power *= opening_pair.challenge; + } + }, + /*no_multhreading_if_less_or_equal=*/16); + // Iterate for log(poly_degree) rounds to compute the round commitments. auto log_poly_degree = static_cast(numeric::get_msb(poly_degree)); std::vector L_elements(log_poly_degree); std::vector R_elements(log_poly_degree); std::size_t round_size = poly_degree; - // TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the - // work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant - // improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the - // loop can also be integrated. + // Perform IPA rounds for (size_t i = 0; i < log_poly_degree; i++) { round_size >>= 1; // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo > + std::mutex addition_lock; Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); - for (size_t j = 0; j < round_size; j++) { - inner_prod_L += a_vec[j] * b_vec[round_size + j]; - inner_prod_R += a_vec[round_size + j] * b_vec[j]; - } + // Run scalar product in parallel + run_loop_in_parallel( + round_size, + [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) { + Fr current_inner_prod_L = Fr::zero(); + Fr current_inner_prod_R = Fr::zero(); + for (size_t j = start; j < end; j++) { + current_inner_prod_L += a_vec[j] * b_vec[round_size + j]; + current_inner_prod_R += a_vec[round_size + j] * b_vec[j]; + } + addition_lock.lock(); + inner_prod_L += current_inner_prod_L; + inner_prod_R += current_inner_prod_R; + addition_lock.unlock(); + }, + /*no_multhreading_if_less_or_equal=*/8); + // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator - L_elements[i] = - // TODO(#473) - barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); + L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); L_elements[i] += aux_generator * inner_prod_L; // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator - // TODO(#473) R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state); R_elements[i] += aux_generator * inner_prod_R; @@ -104,23 +125,32 @@ template class IPA { const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index); const Fr round_challenge_inv = round_challenge.invert(); - std::vector G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast(round_size)); - std::vector G_hi(G_vec_local.begin() + static_cast(round_size), G_vec_local.end()); - G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv); - G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge); + auto G_lo = GroupElement::batch_mul_with_endomorphism( + std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast(round_size) }, + round_challenge_inv); + auto G_hi = GroupElement::batch_mul_with_endomorphism( + std::span{ G_vec_local.begin() + static_cast(round_size), + G_vec_local.begin() + static_cast(round_size * 2) }, + round_challenge); // Update the vectors a_vec, b_vec and G_vec. // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge - for (size_t j = 0; j < round_size; j++) { - a_vec[j] *= round_challenge; - a_vec[j] += round_challenge_inv * a_vec[round_size + j]; - b_vec[j] *= round_challenge_inv; - b_vec[j] += round_challenge * b_vec[round_size + j]; - - G_vec_local[j] = G_lo[j] + G_hi[j]; - } + run_loop_in_parallel( + round_size, + [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size]( + size_t start, size_t end) { + for (size_t j = start; j < end; j++) { + a_vec[j] *= round_challenge; + a_vec[j] += round_challenge_inv * a_vec[round_size + j]; + b_vec[j] *= round_challenge_inv; + b_vec[j] += round_challenge * b_vec[round_size + j]; + + G_vec_local[j] = G_lo[j] + G_hi[j]; + } + }, + /*no_multhreading_if_less_or_equal=*/4); } transcript->send_to_verifier("IPA:a_0", a_vec[0]); @@ -166,7 +196,7 @@ template class IPA { msm_scalars[2 * i] = round_challenges[i].sqr(); msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr(); } - // TODO(#473) + GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state); GroupElement C_zero = C_prime + LR_sums; @@ -188,29 +218,42 @@ template class IPA { // Compute G_zero // First construct s_vec std::vector s_vec(poly_degree); - for (size_t i = 0; i < poly_degree; i++) { - Fr s_vec_scalar = Fr::one(); - for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) { - auto bit = (i >> j) & 1; - bool b = static_cast(bit); - if (b) { - s_vec_scalar *= round_challenges[log_poly_degree - 1 - j]; - } else { - s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j]; + run_loop_in_parallel( + poly_degree, + [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { + for (size_t i = start; i < end; i++) { + Fr s_vec_scalar = Fr::one(); + for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) { + auto bit = (i >> j) & 1; + bool b = static_cast(bit); + if (b) { + s_vec_scalar *= round_challenges[log_poly_degree - 1 - j]; + } else { + s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j]; + } + } + s_vec[i] = s_vec_scalar; } - } - s_vec[i] = s_vec_scalar; - } + }, + /*no_multhreading_if_less_or_equal=*/4); + auto srs_elements = vk->srs->get_monomial_points(); + // Copy the G_vector to local memory. std::vector G_vec_local(poly_degree); + // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - for (size_t i = 0; i < poly_degree * 2; i += 2) { - G_vec_local[i >> 1] = srs_elements[i]; - } - // TODO(#473) + run_loop_in_parallel( + poly_degree, + [&G_vec_local, srs_elements](size_t start, size_t end) { + for (size_t i = start * 2; i < end * 2; i += 2) { + G_vec_local[i >> 1] = srs_elements[i]; + } + }, + /*no_multhreading_if_less_or_equal=*/16); + auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state); diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index 6d3007c8739..232c30be69d 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function& func #endif #endif } + +/** + * @brief Split a loop into several loops running in parallel + * + * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function + * that should contain the work loop + * @param num_points Total number of elements + * @param func A function or lambda expression with a for loop inside, for example: + * [](size_t start, size_t end){for (size_t i=start; i& func, + size_t no_multhreading_if_less_or_equal) +{ + if (num_points <= no_multhreading_if_less_or_equal) { + func(0, num_points); + return; + } + // Get number of cpus we can split into + const size_t num_cpus = get_num_cpus(); + + // Compute the size of a single chunk + const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1); + // Parallelize over chunks + parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) { + // If num_points is small, sometimes we need fewer CPUs + if (chunk_size * chunk_index > num_points) { + return; + } + // Compute the current chunk size (can differ in case it's the last chunk) + size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size); + if (current_chunk_size == 0) { + return; + } + size_t start = chunk_index * chunk_size; + size_t end = chunk_index * chunk_size + current_chunk_size; + func(start, end); + }); +}; diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp index 96e3df74092..fe799e1d46f 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.hpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp @@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2() } void parallel_for(size_t num_iterations, const std::function& func); +void run_loop_in_parallel(size_t num_points, + const std::function& func, + size_t no_multhreading_if_less_or_equal = 0); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp index 0b2d13761f4..420b4649856 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp @@ -92,7 +92,7 @@ template class alignas(32) element { static void batch_normalize(element* elements, size_t num_elements) noexcept; static std::vector> batch_mul_with_endomorphism( - const std::vector>& points, const Fr& exponent) noexcept; + const std::span>& points, const Fr& exponent) noexcept; Fq x; Fq y; diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp index 2deae832391..da98fb74f8f 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp @@ -1,4 +1,5 @@ #pragma once +#include "barretenberg/common/thread.hpp" #include "barretenberg/ecc/groups/element.hpp" #include "element.hpp" @@ -687,77 +688,130 @@ element element::mul_with_endomorphism(const Fr& exponent) return work_element; } +/** + * @brief Multiply each point by the same exponent + * + * @details We use the fact that all points are being multiplied by the same exponent to batch the operations (perform + * batch affine additions and doublings with batch inversion trick) + * + * @param points The span of individual points that need to be scaled + * @param exponent The scalar we multiply all the points by + * @return std::vector> Vector of new points where each point is exponentâ‹…points[i] + */ template std::vector> element::batch_mul_with_endomorphism( - const std::vector>& points, const Fr& exponent) noexcept + const std::span>& points, const Fr& exponent) noexcept { typedef affine_element affine_element; const size_t num_points = points.size(); + + // Space for temporary values std::vector scratch_space(num_points); // we can mutate rhs but NOT lhs! // output is stored in rhs - const auto batch_affine_add = [num_points, &scratch_space](const affine_element* lhs, affine_element* rhs) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < num_points; i += 1) { - scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 - rhs[i].x -= lhs[i].x; // x2 - x1 - rhs[i].y -= lhs[i].y; // y2 - y1 - rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (rhs[i].x); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - for (size_t i = (num_points)-1; i < num_points; i -= 1) { - rhs[i].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= rhs[i].x; - rhs[i].x = rhs[i].y.sqr(); - rhs[i].x = rhs[i].x - (scratch_space[i]); // x3 = lambda_squared - x2 - // - x1 - scratch_space[i] = lhs[i].x - rhs[i].x; - scratch_space[i] *= rhs[i].y; - rhs[i].y = scratch_space[i] - lhs[i].y; - } + /** + * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_add_chunked = + [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); + + for (size_t i = 0; i < point_count; i += 1) { + personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 + rhs[i].x -= lhs[i].x; // x2 - x1 + rhs[i].y -= lhs[i].y; // y2 - y1 + rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old + batch_inversion_accumulator *= (rhs[i].x); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + for (size_t i = (point_count)-1; i < point_count; i -= 1) { + rhs[i].y *= batch_inversion_accumulator; // update accumulator + batch_inversion_accumulator *= rhs[i].x; + rhs[i].x = rhs[i].y.sqr(); + rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 + // - x1 + personal_scratch_space[i] = lhs[i].x - rhs[i].x; + personal_scratch_space[i] *= rhs[i].y; + rhs[i].y = personal_scratch_space[i] - lhs[i].y; + } + }; + + /** + * @brief Perform batch affine addition in parallel + * + */ + const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, + affine_element* rhs) { + run_loop_in_parallel( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + /*no_multhreading_if_less_or_equal=*/4); }; - // double the elements in lhs - const auto batch_affine_double = [num_points, &scratch_space](affine_element* lhs) { - Fq batch_inversion_accumulator = Fq::one(); + /** + * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_double_chunked = + [](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); - for (size_t i = 0; i < num_points; i += 1) { + for (size_t i = 0; i < point_count; i += 1) { - scratch_space[i] = lhs[i].x.sqr(); - scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i]; + personal_scratch_space[i] = lhs[i].x.sqr(); + personal_scratch_space[i] = + personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i]; - scratch_space[i] *= batch_inversion_accumulator; + personal_scratch_space[i] *= batch_inversion_accumulator; - batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); + batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); - Fq temp; - for (size_t i = (num_points)-1; i < num_points; i -= 1) { + Fq temp; + for (size_t i = (point_count)-1; i < point_count; i -= 1) { - scratch_space[i] *= batch_inversion_accumulator; - batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); + personal_scratch_space[i] *= batch_inversion_accumulator; + batch_inversion_accumulator *= (lhs[i].y + lhs[i].y); - temp = lhs[i].x; - lhs[i].x = scratch_space[i].sqr() - (lhs[i].x + lhs[i].x); - lhs[i].y = scratch_space[i] * (temp - lhs[i].x) - lhs[i].y; - } + temp = lhs[i].x; + lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x); + lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y; + } + }; + /** + * @brief Perform point doubling in parallel + * + */ + const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) { + run_loop_in_parallel( + num_points, + [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) { + batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start); + }, + /*no_multhreading_if_less_or_equal=*/4); }; - // Compute wnaf for scalar const Fr converted_scalar = exponent.from_montgomery_form(); + // If the scalar is zero, just set results to the point at infinity if (converted_scalar.is_zero()) { affine_element result{ Fq::zero(), Fq::zero() }; result.self_set_infinity(); - std::vector results; - for (size_t i = 0; i < num_points; ++i) { - results.emplace_back(result); - } + std::vector results(num_points); + run_loop_in_parallel( + num_points, + [&results, result](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + results[i] = result; + } + }, + /*no_multhreading_if_less_or_equal=*/16); return results; } @@ -768,90 +822,132 @@ std::vector> element::batch_mul_with_endomo for (auto& table : lookup_table) { table.resize(num_points); } + // Initialize first etnries in lookup table std::vector temp_point_vector(num_points); - for (size_t i = 0; i < num_points; ++i) { - temp_point_vector[i] = points[i]; - lookup_table[0][i] = points[i]; - } + run_loop_in_parallel( + num_points, + [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + temp_point_vector[i] = points[i]; + lookup_table[0][i] = points[i]; + } + }, + /*no_multhreading_if_less_or_equal=*/16); + + // Construct lookup table batch_affine_double(&temp_point_vector[0]); for (size_t j = 1; j < lookup_size; ++j) { - - for (size_t i = 0; i < num_points; ++i) { - lookup_table[j][i] = lookup_table[j - 1][i]; - } + run_loop_in_parallel( + num_points, + [j, &lookup_table](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + lookup_table[j][i] = lookup_table[j - 1][i]; + } + }, + /*no_multhreading_if_less_or_equal=*/16); batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]); } uint64_t wnaf_table[num_rounds * 2]; Fr endo_scalar; + + // Split single scalar into 2 scalar in endo form Fr::split_into_endomorphism_scalars(converted_scalar, endo_scalar, *(Fr*)&endo_scalar.data[2]); // NOLINT bool skew = false; bool endo_skew = false; + // Construct wnaf forms from scalars wnaf::fixed_wnaf(&endo_scalar.data[0], &wnaf_table[0], skew, 0, 2, num_wnaf_bits); wnaf::fixed_wnaf(&endo_scalar.data[2], &wnaf_table[1], endo_skew, 0, 2, num_wnaf_bits); std::vector work_elements(num_points); + constexpr Fq beta = Fq::cube_root_of_unity(); uint64_t wnaf_entry = 0; uint64_t index = 0; bool sign = 0; - Fq beta = Fq::cube_root_of_unity(); - - for (size_t i = 0; i < 2; ++i) { - for (size_t j = 0; j < num_points; ++j) { - wnaf_entry = wnaf_table[i]; - index = wnaf_entry & 0x0fffffffU; - sign = static_cast((wnaf_entry >> 31) & 1); - const bool is_odd = ((i & 1) == 1); - auto to_add = lookup_table[static_cast(index)][j]; - to_add.y.self_conditional_negate(sign ^ is_odd); - if (is_odd) { - to_add.x *= beta; - } - if (i == 0) { - work_elements[j] = to_add; - } else { - temp_point_vector[j] = to_add; - } - } + // Prepare elements for the first batch addition + for (size_t j = 0; j < 2; ++j) { + wnaf_entry = wnaf_table[j]; + index = wnaf_entry & 0x0fffffffU; + sign = static_cast((wnaf_entry >> 31) & 1); + const bool is_odd = ((j & 1) == 1); + run_loop_in_parallel( + num_points, + [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start, + size_t end) { + for (size_t i = start; i < end; ++i) { + + auto to_add = lookup_table[static_cast(index)][i]; + to_add.y.self_conditional_negate(sign ^ is_odd); + if (is_odd) { + to_add.x *= beta; + } + if (j == 0) { + work_elements[i] = to_add; + } else { + temp_point_vector[i] = to_add; + } + } + }, + /*no_multhreading_if_less_or_equal=*/16); } + // First cycle of addition batch_affine_add(&temp_point_vector[0], &work_elements[0]); - - for (size_t i = 2; i < num_rounds * 2; ++i) { - wnaf_entry = wnaf_table[i]; + // Run through SM logic in wnaf form (excluding the skew) + for (size_t j = 2; j < num_rounds * 2; ++j) { + wnaf_entry = wnaf_table[j]; index = wnaf_entry & 0x0fffffffU; sign = static_cast((wnaf_entry >> 31) & 1); - const bool is_odd = ((i & 1) == 1); + const bool is_odd = ((j & 1) == 1); if (!is_odd) { for (size_t k = 0; k < 4; ++k) { batch_affine_double(&work_elements[0]); } } - for (size_t j = 0; j < num_points; ++j) { - auto to_add = lookup_table[static_cast(index)][j]; - to_add.y.self_conditional_negate(sign ^ is_odd); - if (is_odd) { - to_add.x *= beta; - } - temp_point_vector[j] = to_add; - } + run_loop_in_parallel( + num_points, + [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + + auto to_add = lookup_table[static_cast(index)][i]; + to_add.y.self_conditional_negate(sign ^ is_odd); + if (is_odd) { + to_add.x *= beta; + } + temp_point_vector[i] = to_add; + } + }, + /*no_multhreading_if_less_or_equal=*/16); batch_affine_add(&temp_point_vector[0], &work_elements[0]); } + // Apply skew for the first endo scalar if (skew) { - for (size_t j = 0; j < num_points; ++j) { - temp_point_vector[j] = -lookup_table[0][j]; - } + run_loop_in_parallel( + num_points, + [&lookup_table, &temp_point_vector](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + + temp_point_vector[i] = -lookup_table[0][i]; + } + }, + /*no_multhreading_if_less_or_equal=*/16); batch_affine_add(&temp_point_vector[0], &work_elements[0]); } - + // Apply skew for the second endo scalar if (endo_skew) { - for (size_t j = 0; j < num_points; ++j) { - temp_point_vector[j] = lookup_table[0][j]; - temp_point_vector[j].x *= beta; - } + run_loop_in_parallel( + num_points, + [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + + temp_point_vector[i] = lookup_table[0][i]; + temp_point_vector[i].x *= beta; + } + }, + /*no_multhreading_if_less_or_equal=*/16); batch_affine_add(&temp_point_vector[0], &work_elements[0]); }