diff --git a/cpp/src/barretenberg/benchmark/CMakeLists.txt b/cpp/src/barretenberg/benchmark/CMakeLists.txt
index 49bcb251f0..4a13dff546 100644
--- a/cpp/src/barretenberg/benchmark/CMakeLists.txt
+++ b/cpp/src/barretenberg/benchmark/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(ipa_bench)
 add_subdirectory(decrypt_bench)
 add_subdirectory(pippenger_bench)
 add_subdirectory(plonk_bench)
diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
new file mode 100644
index 0000000000..6e66ce2cf3
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Each source represents a separate benchmark suite
+set(BENCHMARK_SOURCES
+    ipa.bench.cpp
+)
+
+# Required libraries for benchmark suites
+set(LINKED_LIBRARIES
+    benchmark::benchmark
+    ultra_honk
+)
+
+# Add executable and custom target for each suite, e.g. ipa_bench
+foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
+    get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
+    add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
+    target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
+    add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endforeach()
\ No newline at end of file
diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
new file mode 100644
index 0000000000..9d4f7028e5
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
@@ -0,0 +1,73 @@
+#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
+#include <benchmark/benchmark.h>
+
+using namespace benchmark;
+using namespace barretenberg;
+using namespace proof_system;
+using namespace proof_system::honk::pcs::ipa;
+namespace {
+using Curve = curve::Grumpkin;
+using Fr = Curve::ScalarField;
+using IPA = IPA<Curve>;
+using OpeningPair = honk::pcs::OpeningPair<Curve>;
+using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
+using Polynomial = Polynomial<Fr>;
+using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
+using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;
+
+constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
+constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
+std::shared_ptr<barretenberg::srs::factories::CrsFactory<Curve>> crs_factory(
+    new barretenberg::srs::factories::FileCrsFactory<Curve>("../srs_db/grumpkin", 1 << 16));
+
+auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+
+std::vector<std::shared_ptr<honk::BaseTranscript<Fr>>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
+                                                                          MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+
+void ipa_open(State& state) noexcept
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t n = 1 << static_cast<size_t>(state.range(0));
+        // Construct the polynomial
+        Polynomial poly(n);
+        for (size_t i = 0; i < n; ++i) {
+            poly[i] = Fr::random_element(&engine);
+        }
+        auto x = Fr::random_element(&engine);
+        auto eval = poly.evaluate(x);
+        const OpeningPair opening_pair = { x, eval };
+        const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
+        // initialize empty prover transcript
+        auto prover_transcript = std::make_shared<honk::BaseTranscript<Fr>>();
+        state.ResumeTiming();
+        // Compute proof
+        IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
+        // Store info for verifier
+        prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript;
+        opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim;
+    }
+}
+void ipa_verify(State& state) noexcept
+{
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Retrieve proofs
+        auto prover_transcript = prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        auto opening_claim = opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        // initialize verifier transcript from proof data
+        auto verifier_transcript = std::make_shared<honk::BaseTranscript<Fr>>(prover_transcript->proof_data);
+
+        state.ResumeTiming();
+        auto result = IPA::verify(vk, opening_claim, verifier_transcript);
+        ASSERT(result);
+    }
+}
+} // namespace
+BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK_MAIN();
\ No newline at end of file
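The suite leans on two Google Benchmark idioms: `DenseRange(MIN, MAX)` runs each benchmark once per log-size, delivered through `state.range(0)`, while `PauseTiming()`/`ResumeTiming()` keep polynomial construction and commitment out of the measured region. A minimal self-contained sketch of the same pattern (the `expensive_op` workload is a placeholder, not part of this PR):

```cpp
#include <benchmark/benchmark.h>
#include <numeric>
#include <vector>

// Placeholder workload standing in for the proving call being measured.
static size_t expensive_op(const std::vector<size_t>& v)
{
    return std::accumulate(v.begin(), v.end(), size_t(0));
}

static void sketch_bench(benchmark::State& state)
{
    for (auto _ : state) {
        state.PauseTiming(); // exclude input construction, as ipa_open does
        std::vector<size_t> input(size_t(1) << static_cast<size_t>(state.range(0)), 1);
        state.ResumeTiming();
        benchmark::DoNotOptimize(expensive_op(input));
    }
}
// One run per exponent 10..16, mirroring the DenseRange over log polynomial degree.
BENCHMARK(sketch_bench)->Unit(benchmark::kMillisecond)->DenseRange(10, 16);
BENCHMARK_MAIN();
```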
diff --git a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
index 85cfb13e66..460f5d6dc4 100644
--- a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
+++ b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -52,46 +52,67 @@ template <typename Curve> class IPA {
         auto a_vec = polynomial;
         auto srs_elements = ck->srs->get_monomial_points();
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         std::vector<Fr> b_vec(poly_degree);
-        Fr b_power = 1;
-        for (size_t i = 0; i < poly_degree; i++) {
-            b_vec[i] = b_power;
-            b_power *= opening_pair.challenge;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&b_vec, &opening_pair](size_t start, size_t end) {
+                Fr b_power = opening_pair.challenge.pow(start);
+                for (size_t i = start; i < end; i++) {
+                    b_vec[i] = b_power;
+                    b_power *= opening_pair.challenge;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
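Note how the `b_vec` loop, whose iterations form a running power and look inherently sequential, gets parallelised: each chunk re-seeds its accumulator with `challenge.pow(start)`, making chunks independent at the cost of one `pow` per chunk. A toy sketch of the pattern, with arithmetic mod a small prime standing in for `Fr` and a hard-coded two-chunk runner standing in for `run_loop_in_parallel`:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Toy stand-ins: arithmetic mod a small prime replaces Fr.
constexpr uint64_t P = 1000000007ULL;
uint64_t mul(uint64_t a, uint64_t b) { return (a * b) % P; }
uint64_t pow_mod(uint64_t base, uint64_t e)
{
    uint64_t r = 1;
    for (; e != 0; e >>= 1, base = mul(base, base)) {
        if ((e & 1) != 0) {
            r = mul(r, base);
        }
    }
    return r;
}

void run_chunked(size_t n, const std::function<void(size_t, size_t)>& func)
{
    func(0, n / 2); // pretend these two calls
    func(n / 2, n); // execute on different threads
}

int main()
{
    const uint64_t challenge = 5;
    std::vector<uint64_t> b_vec(1 << 10);
    run_chunked(b_vec.size(), [&](size_t start, size_t end) {
        // Re-seeding with challenge^start is what decouples the chunks.
        uint64_t b_power = pow_mod(challenge, start);
        for (size_t i = start; i < end; i++) {
            b_vec[i] = b_power;
            b_power = mul(b_power, challenge);
        }
    });
    return 0;
}
```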
         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
         std::vector<GroupElement> L_elements(log_poly_degree);
         std::vector<GroupElement> R_elements(log_poly_degree);
         std::size_t round_size = poly_degree;
 
-        // TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the
-        // work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant
-        // improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
-        // loop can also be integrated.
+        // Perform IPA rounds
         for (size_t i = 0; i < log_poly_degree; i++) {
             round_size >>= 1;
             // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
+            std::mutex addition_lock;
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
-            for (size_t j = 0; j < round_size; j++) {
-                inner_prod_L += a_vec[j] * b_vec[round_size + j];
-                inner_prod_R += a_vec[round_size + j] * b_vec[j];
-            }
+            // Run scalar product in parallel
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
+                    Fr current_inner_prod_L = Fr::zero();
+                    Fr current_inner_prod_R = Fr::zero();
+                    for (size_t j = start; j < end; j++) {
+                        current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
+                        current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
+                    }
+                    addition_lock.lock();
+                    inner_prod_L += current_inner_prod_L;
+                    inner_prod_R += current_inner_prod_R;
+                    addition_lock.unlock();
+                },
+                /*no_multhreading_if_less_or_equal=*/8);
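The inner products are a classic parallel reduction: each chunk accumulates into thread-local sums and takes the mutex once per chunk rather than once per iteration. A self-contained sketch of that shape, with `std::thread` standing in for the thread pool and `uint64_t` for `Fr`:

```cpp
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

int main()
{
    std::vector<uint64_t> a(1 << 16, 3), b(1 << 16, 7);
    uint64_t inner_prod = 0;
    std::mutex addition_lock;

    auto worker = [&](size_t start, size_t end) {
        uint64_t partial = 0; // thread-local accumulator: no contention in the hot loop
        for (size_t j = start; j < end; j++) {
            partial += a[j] * b[j];
        }
        std::lock_guard<std::mutex> guard(addition_lock); // one lock per chunk
        inner_prod += partial;
    };

    std::thread t1(worker, 0, a.size() / 2);
    std::thread t2(worker, a.size() / 2, a.size());
    t1.join();
    t2.join();
    return 0;
}
```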
+
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
-            L_elements[i] =
-                // TODO(#473)
-                barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
-                    &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
+            L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
+                &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
             L_elements[i] += aux_generator * inner_prod_L;
 
             // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
-            // TODO(#473)
             R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
                 &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
             R_elements[i] += aux_generator * inner_prod_R;
@@ -104,23 +125,32 @@ template <typename Curve> class IPA {
             const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
             const Fr round_challenge_inv = round_challenge.invert();
 
-            std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
-            std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
-            G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
-            G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
+            auto G_lo = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                round_challenge_inv);
+            auto G_hi = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
+                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                round_challenge);
 
             // Update the vectors a_vec, b_vec and G_vec.
             // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
             // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
             // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
-            for (size_t j = 0; j < round_size; j++) {
-                a_vec[j] *= round_challenge;
-                a_vec[j] += round_challenge_inv * a_vec[round_size + j];
-                b_vec[j] *= round_challenge_inv;
-                b_vec[j] += round_challenge * b_vec[round_size + j];
-
-                G_vec_local[j] = G_lo[j] + G_hi[j];
-            }
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
+                    size_t start, size_t end) {
+                    for (size_t j = start; j < end; j++) {
+                        a_vec[j] *= round_challenge;
+                        a_vec[j] += round_challenge_inv * a_vec[round_size + j];
+                        b_vec[j] *= round_challenge_inv;
+                        b_vec[j] += round_challenge * b_vec[round_size + j];
+
+                        G_vec_local[j] = G_lo[j] + G_hi[j];
+                    }
+                },
+                /*no_multhreading_if_less_or_equal=*/4);
         }
 
         transcript->send_to_verifier("IPA:a_0", a_vec[0]);
@@ -166,7 +196,7 @@ template <typename Curve> class IPA {
             msm_scalars[2 * i] = round_challenges[i].sqr();
             msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
         }
-        // TODO(#473)
+
         GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
         GroupElement C_zero = C_prime + LR_sums;
@@ -188,29 +218,42 @@ template <typename Curve> class IPA {
         // Compute G_zero
         // First construct s_vec
         std::vector<Fr> s_vec(poly_degree);
-        for (size_t i = 0; i < poly_degree; i++) {
-            Fr s_vec_scalar = Fr::one();
-            for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                auto bit = (i >> j) & 1;
-                bool b = static_cast<bool>(bit);
-                if (b) {
-                    s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
-                } else {
-                    s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                }
-            }
-            s_vec[i] = s_vec_scalar;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    Fr s_vec_scalar = Fr::one();
+                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+                        auto bit = (i >> j) & 1;
+                        bool b = static_cast<bool>(bit);
+                        if (b) {
+                            s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
+                        } else {
+                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
+                        }
+                    }
+                    s_vec[i] = s_vec_scalar;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/4);
+
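Each `s_vec[i]` is a product of the round challenges or their inverses, selected by the bits of `i` from most significant down. A toy walkthrough with `double` standing in for `Fr` (so an inverse is just `1/x`) and made-up challenge values:

```cpp
#include <cstddef>
#include <vector>

int main()
{
    const size_t log_n = 3;
    std::vector<double> challenge = { 2.0, 3.0, 5.0 };
    std::vector<double> challenge_inv = { 0.5, 1.0 / 3.0, 0.2 };

    std::vector<double> s_vec(size_t(1) << log_n);
    for (size_t i = 0; i < s_vec.size(); i++) {
        double s = 1.0;
        // Bit j of i (MSB first) selects the challenge of round log_n-1-j
        // when set, its inverse when clear; unsigned wrap ends the loop.
        for (size_t j = log_n - 1; j != size_t(-1); j--) {
            bool b = ((i >> j) & 1) != 0;
            s *= b ? challenge[log_n - 1 - j] : challenge_inv[log_n - 1 - j];
        }
        s_vec[i] = s;
    }
    // e.g. s_vec[0] = (1/2)(1/3)(1/5) and s_vec[7] = 2 * 3 * 5
    return 0;
}
```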
         auto srs_elements = vk->srs->get_monomial_points();
+
         // Copy the G_vector to local memory.
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
-        // TODO(#473)
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
diff --git a/cpp/src/barretenberg/common/thread.cpp b/cpp/src/barretenberg/common/thread.cpp
index 6d3007c873..232c30be69 100644
--- a/cpp/src/barretenberg/common/thread.cpp
+++ b/cpp/src/barretenberg/common/thread.cpp
@@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
 #endif
 #endif
 }
+
+/**
+ * @brief Split a loop into several loops running in parallel
+ *
+ * @details Splits num_points into an appropriate number of chunks to process in parallel and calls the function
+ * that should contain the work loop
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){...}}
+ * @param no_multhreading_if_less_or_equal If num_points is less than or equal to this value, run sequentially
+ */
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal)
+{
+    if (num_points <= no_multhreading_if_less_or_equal) {
+        func(0, num_points);
+        return;
+    }
+    // Get number of cpus we can split into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ in case it's the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+};
diff --git a/cpp/src/barretenberg/common/thread.hpp b/cpp/src/barretenberg/common/thread.hpp
index 96e3df7409..fe799e1d46 100644
--- a/cpp/src/barretenberg/common/thread.hpp
+++ b/cpp/src/barretenberg/common/thread.hpp
@@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
 }
 
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal = 0);
\ No newline at end of file
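Two details of `run_loop_in_parallel` are worth noting: the threshold short-circuits to a plain sequential call for small inputs, and the ceil-division chunk size means trailing workers can receive a short or empty range, hence the two guards. A self-contained re-implementation of the same arithmetic, offered as a sketch only (`std::thread` stands in for the real `parallel_for` dispatch, and `sequential_threshold` is my own name for the threshold parameter):

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

void run_chunked(size_t num_points, const std::function<void(size_t, size_t)>& func, size_t sequential_threshold)
{
    if (num_points <= sequential_threshold) {
        func(0, num_points); // small input: skip thread overhead entirely
        return;
    }
    const size_t num_cpus = std::max<size_t>(std::thread::hardware_concurrency(), 1);
    // Ceil division: every chunk but possibly the last has this size
    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
    std::vector<std::thread> threads;
    for (size_t chunk_index = 0; chunk_index < num_cpus; chunk_index++) {
        size_t start = chunk_index * chunk_size;
        if (start >= num_points) {
            break; // rounding up can leave trailing CPUs with no work
        }
        size_t end = std::min(start + chunk_size, num_points);
        threads.emplace_back(func, start, end);
    }
    for (auto& t : threads) {
        t.join();
    }
}

int main()
{
    std::vector<int> v(100);
    run_chunked(
        v.size(),
        [&](size_t start, size_t end) {
            for (size_t i = start; i < end; i++) {
                v[i] = 1;
            }
        },
        /*sequential_threshold=*/16);
    return 0;
}
```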
"barretenberg/common/thread.hpp" #include "barretenberg/ecc/groups/element.hpp" #include "element.hpp" @@ -687,77 +688,130 @@ element element::mul_with_endomorphism(const Fr& exponent) return work_element; } +/** + * @brief Multiply each point by the same exponent + * + * @details We use the fact that all points are being multiplied by the same exponent to batch the operations (perform + * batch affine additions and doublings with batch inversion trick) + * + * @param points The span of individual points that need to be scaled + * @param exponent The scalar we multiply all the points by + * @return std::vector> Vector of new points where each point is exponentâ‹…points[i] + */ template std::vector> element::batch_mul_with_endomorphism( - const std::vector>& points, const Fr& exponent) noexcept + const std::span>& points, const Fr& exponent) noexcept { typedef affine_element affine_element; const size_t num_points = points.size(); + + // Space for temporary values std::vector scratch_space(num_points); // we can mutate rhs but NOT lhs! // output is stored in rhs - const auto batch_affine_add = [num_points, &scratch_space](const affine_element* lhs, affine_element* rhs) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < num_points; i += 1) { - scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 - rhs[i].x -= lhs[i].x; // x2 - x1 - rhs[i].y -= lhs[i].y; // y2 - y1 - rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (rhs[i].x); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - for (size_t i = (num_points)-1; i < num_points; i -= 1) { - rhs[i].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= rhs[i].x; - rhs[i].x = rhs[i].y.sqr(); - rhs[i].x = rhs[i].x - (scratch_space[i]); // x3 = lambda_squared - x2 - // - x1 - scratch_space[i] = lhs[i].x - rhs[i].x; - scratch_space[i] *= rhs[i].y; - rhs[i].y = scratch_space[i] - lhs[i].y; - } + /** + * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_add_chunked = + [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); + + for (size_t i = 0; i < point_count; i += 1) { + personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 + rhs[i].x -= lhs[i].x; // x2 - x1 + rhs[i].y -= lhs[i].y; // y2 - y1 + rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old + batch_inversion_accumulator *= (rhs[i].x); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + for (size_t i = (point_count)-1; i < point_count; i -= 1) { + rhs[i].y *= batch_inversion_accumulator; // update accumulator + batch_inversion_accumulator *= rhs[i].x; + rhs[i].x = rhs[i].y.sqr(); + rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 + // - x1 + personal_scratch_space[i] = lhs[i].x - rhs[i].x; + personal_scratch_space[i] *= rhs[i].y; + rhs[i].y = personal_scratch_space[i] - lhs[i].y; + } + }; + + /** + * @brief Perform batch affine addition in parallel + * + */ + const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, + affine_element* rhs) { + run_loop_in_parallel( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + 
-    // double the elements in lhs
-    const auto batch_affine_double = [num_points, &scratch_space](affine_element* lhs) {
-        Fq batch_inversion_accumulator = Fq::one();
+    /**
+     * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion
+     *
+     */
+    const auto batch_affine_double_chunked =
+        [](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) {
+            Fq batch_inversion_accumulator = Fq::one();
 
-        for (size_t i = 0; i < num_points; i += 1) {
+            for (size_t i = 0; i < point_count; i += 1) {
 
-            scratch_space[i] = lhs[i].x.sqr();
-            scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i];
+                personal_scratch_space[i] = lhs[i].x.sqr();
+                personal_scratch_space[i] =
+                    personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i];
 
-            scratch_space[i] *= batch_inversion_accumulator;
+                personal_scratch_space[i] *= batch_inversion_accumulator;
 
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
-        }
-        batch_inversion_accumulator = batch_inversion_accumulator.invert();
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+            }
+            batch_inversion_accumulator = batch_inversion_accumulator.invert();
 
-        Fq temp;
-        for (size_t i = (num_points)-1; i < num_points; i -= 1) {
+            Fq temp;
+            for (size_t i = (point_count)-1; i < point_count; i -= 1) {
 
-            scratch_space[i] *= batch_inversion_accumulator;
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+                personal_scratch_space[i] *= batch_inversion_accumulator;
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
 
-            temp = lhs[i].x;
-            lhs[i].x = scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
-            lhs[i].y = scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
-        }
+                temp = lhs[i].x;
+                lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
+                lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
+            }
+        };
+    /**
+     * @brief Perform point doubling in parallel
+     *
+     */
+    const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) {
+        run_loop_in_parallel(
+            num_points,
+            [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) {
+                batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start);
+            },
+            /*no_multhreading_if_less_or_equal=*/4);
     };
-    // Compute wnaf for scalar
     const Fr converted_scalar = exponent.from_montgomery_form();
 
+    // If the scalar is zero, just set results to the point at infinity
     if (converted_scalar.is_zero()) {
         affine_element result{ Fq::zero(), Fq::zero() };
         result.self_set_infinity();
-        std::vector<affine_element> results;
-        for (size_t i = 0; i < num_points; ++i) {
-            results.emplace_back(result);
-        }
+        std::vector<affine_element> results(num_points);
+        run_loop_in_parallel(
+            num_points,
+            [&results, result](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    results[i] = result;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         return results;
     }
@@ -768,90 +822,132 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
     for (auto& table : lookup_table) {
         table.resize(num_points);
    }
+
+    // Initialize first entries in lookup table
     std::vector<affine_element> temp_point_vector(num_points);
-    for (size_t i = 0; i < num_points; ++i) {
-        temp_point_vector[i] = points[i];
-        lookup_table[0][i] = points[i];
-    }
+    run_loop_in_parallel(
+        num_points,
+        [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) {
+            for (size_t i = start; i < end; ++i) {
+                temp_point_vector[i] = points[i];
+                lookup_table[0][i] = points[i];
+            }
+        },
+        /*no_multhreading_if_less_or_equal=*/16);
+
+    // Construct lookup table
     batch_affine_double(&temp_point_vector[0]);
     for (size_t j = 1; j < lookup_size; ++j) {
-
-        for (size_t i = 0; i < num_points; ++i) {
-            lookup_table[j][i] = lookup_table[j - 1][i];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [j, &lookup_table](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    lookup_table[j][i] = lookup_table[j - 1][i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
        batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]);
     }
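`lookup_table[j]` holds the odd multiples (2j+1)·P of every input point: row 0 is P itself, `temp_point_vector` is doubled once to hold 2P, and each row is the previous row plus 2P. Integer stand-ins make the structure easy to check (a `lookup_size` of 8 is assumed here purely for illustration; the real value is set by the wnaf window):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Integers stand in for curve points: "addition" is +, "doubling" is *2.
    const size_t lookup_size = 8; // assumed: rows hold P, 3P, 5P, ..., 15P
    std::vector<long> points = { 10, 20, 30 };
    const size_t num_points = points.size();

    std::vector<std::vector<long>> lookup_table(lookup_size, std::vector<long>(num_points));
    std::vector<long> temp(num_points);
    for (size_t i = 0; i < num_points; ++i) {
        lookup_table[0][i] = points[i]; // row 0: 1*P
        temp[i] = 2 * points[i];        // the single batch_affine_double
    }
    for (size_t j = 1; j < lookup_size; ++j) {
        for (size_t i = 0; i < num_points; ++i) {
            // row j = row j-1 + 2P, i.e. (2j+1)*P
            lookup_table[j][i] = lookup_table[j - 1][i] + temp[i];
        }
    }
    assert(lookup_table[7][0] == 15 * 10);
    return 0;
}
```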
     uint64_t wnaf_table[num_rounds * 2];
     Fr endo_scalar;
+
+    // Split single scalar into 2 scalars in endo form
     Fr::split_into_endomorphism_scalars(converted_scalar, endo_scalar, *(Fr*)&endo_scalar.data[2]); // NOLINT
 
     bool skew = false;
     bool endo_skew = false;
+    // Construct wnaf forms from scalars
     wnaf::fixed_wnaf(&endo_scalar.data[0], &wnaf_table[0], skew, 0, 2, num_wnaf_bits);
     wnaf::fixed_wnaf(&endo_scalar.data[2], &wnaf_table[1], endo_skew, 0, 2, num_wnaf_bits);
 
     std::vector<affine_element> work_elements(num_points);
+
+    constexpr Fq beta = Fq::cube_root_of_unity();
     uint64_t wnaf_entry = 0;
     uint64_t index = 0;
     bool sign = 0;
-    Fq beta = Fq::cube_root_of_unity();
-
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < num_points; ++j) {
-            wnaf_entry = wnaf_table[i];
-            index = wnaf_entry & 0x0fffffffU;
-            sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-            const bool is_odd = ((i & 1) == 1);
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            if (i == 0) {
-                work_elements[j] = to_add;
-            } else {
-                temp_point_vector[j] = to_add;
-            }
-        }
-    }
+    // Prepare elements for the first batch addition
+    for (size_t j = 0; j < 2; ++j) {
+        wnaf_entry = wnaf_table[j];
+        index = wnaf_entry & 0x0fffffffU;
+        sign = static_cast<bool>((wnaf_entry >> 31) & 1);
+        const bool is_odd = ((j & 1) == 1);
+        run_loop_in_parallel(
+            num_points,
+            [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start,
+                                                                                              size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    if (j == 0) {
+                        work_elements[i] = to_add;
+                    } else {
+                        temp_point_vector[i] = to_add;
+                    }
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+    }
+    // First cycle of addition
     batch_affine_add(&temp_point_vector[0], &work_elements[0]);
-
-    for (size_t i = 2; i < num_rounds * 2; ++i) {
-        wnaf_entry = wnaf_table[i];
+    // Run through SM logic in wnaf form (excluding the skew)
+    for (size_t j = 2; j < num_rounds * 2; ++j) {
+        wnaf_entry = wnaf_table[j];
         index = wnaf_entry & 0x0fffffffU;
         sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-        const bool is_odd = ((i & 1) == 1);
+        const bool is_odd = ((j & 1) == 1);
         if (!is_odd) {
             for (size_t k = 0; k < 4; ++k) {
                 batch_affine_double(&work_elements[0]);
             }
         }
-        for (size_t j = 0; j < num_points; ++j) {
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            temp_point_vector[j] = to_add;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    temp_point_vector[i] = to_add;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }
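In the main loop, even wnaf entries belong to the base scalar and trigger four doublings (one window), while odd entries reuse the already-doubled accumulator for the endomorphism scalar, with `x` scaled by β. The four doublings amount to Horner's rule in base 2^4, as a tiny integer sketch shows (the digit values below are made up):

```cpp
#include <cassert>

int main()
{
    // Horner's rule in base 2^4: scalar = ((d2 * 16) + d1) * 16 + d0.
    // Four doublings of the accumulator play the role of "* 16" on a point.
    const long digits[3] = { 5, 11, 3 }; // hypothetical window digits, MSB first
    const long point = 7;                // integer stand-in for a curve point

    long acc = 0;
    for (long d : digits) {
        for (int k = 0; k < 4; ++k) {
            acc += acc; // one doubling; four of them multiply the accumulator by 16
        }
        acc += d * point; // adding the lookup-table entry d*P
    }
    assert(acc == ((5 * 16 + 11) * 16 + 3) * point);
    return 0;
}
```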
+    // Apply skew for the first endo scalar
     if (skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = -lookup_table[0][j];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [&lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    temp_point_vector[i] = -lookup_table[0][i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }
-
+    // Apply skew for the second endo scalar
     if (endo_skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = lookup_table[0][j];
-            temp_point_vector[j].x *= beta;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    temp_point_vector[i] = lookup_table[0][i];
+                    temp_point_vector[i].x *= beta;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }