From fd1f619f443916c172b6311aa71a84153145ef4d Mon Sep 17 00:00:00 2001 From: Innokentii Sennovskii Date: Sat, 13 Jan 2024 16:38:01 +0000 Subject: [PATCH] feat: Benchmarks for basic functionality and IPA improvements (#4004) This PR: 1. Adds benchmarks to gauge how much operations such as finite field addition and splitting into parallel threads with parallel_for cost 2. Introduces a new function `run_loop_in_parallel_if_effective` which calculates if it's worth splitting the workload into separate threads before doing it 3. Updates IPA and the batch_mul_with_endomorphism to use this function 4. Adds batch_affine_add method to element and uses it to make IPA faster. --- .../src/barretenberg/benchmark/CMakeLists.txt | 3 +- .../benchmark/basics_bench/CMakeLists.txt | 18 + .../basics_bench/analyse_all_benchmarks.py | 88 ++++ .../benchmark/basics_bench/basics.bench.cpp | 382 ++++++++++++++++++ .../basics_bench/single_benchmark_analysis.py | 46 +++ .../commitment_schemes/ipa/ipa.hpp | 47 ++- .../cpp/src/barretenberg/common/thread.cpp | 77 ++++ .../cpp/src/barretenberg/common/thread.hpp | 11 +- .../src/barretenberg/ecc/groups/element.hpp | 3 + .../barretenberg/ecc/groups/element_impl.hpp | 192 +++++++-- 10 files changed, 817 insertions(+), 50 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 968acb82531..285f2bb5937 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -1,9 +1,10 @@ -add_subdirectory(ipa_bench) add_subdirectory(decrypt_bench) +add_subdirectory(ipa_bench) add_subdirectory(pippenger_bench) add_subdirectory(plonk_bench) add_subdirectory(ultra_bench) add_subdirectory(goblin_bench) +add_subdirectory(basics_bench) add_subdirectory(relations_bench) add_subdirectory(widgets_bench) add_subdirectory(protogalaxy_bench) \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt new file mode 100644 index 00000000000..d23c4f6597f --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt @@ -0,0 +1,18 @@ +# Each source represents a separate benchmark suite +set(BENCHMARK_SOURCES + basics.bench.cpp +) + +# Required libraries for benchmark suites +set(LINKED_LIBRARIES + benchmark::benchmark + ecc +) + +# Add executable and custom target for each suite, e.g. 
ultra_honk_bench +foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES}) + get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension + add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES}) + add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endforeach() \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py new file mode 100644 index 00000000000..5edebdb815c --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py @@ -0,0 +1,88 @@ +#!/usr/bin/python3 +""" +Tool for analysing several benchmarks from basics_bench to calculate operation timings +For example, in src directory: +python3 ../src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py -f bin/basics_bench +""" +import argparse +import subprocess +import tempfile +from single_benchmark_analysis import evaluate_benchmark_from_file +import os + +# Some of the benchmarks use other operations to randomise the procedure, so we need to subtract the results +filter_rules={ + "sequential_copy":"cycle_waste", + "cycle_waste":None, + "parallel_for_field_element_addition:":None, + "ff_addition":"cycle_waste", + "ff_multiplication":"cycle_waste", + "ff_sqr":"cycle_waste", + "ff_invert":"ff_addition", + "ff_to_montgomery":"cycle_waste", + "ff_from_montgomery":"cycle_waste", + "ff_reduce":"ff_addition", + "projective_point_addition":"cycle_waste", + "projective_point_accidental_doubling":"cycle_waste", + "projective_point_doubling":"cycle_waste", + "scalar_multiplication":"ff_addition", +} +def get_benchmarks(filename): + """ + Get a list of benchmarks from the binary + """ + result=subprocess.run([filename,"--benchmark_list_tests"],capture_output=True) + result.check_returncode() + output_lines=result.stdout.splitlines() + benchmark_names=set([x.decode().split('/')[0] for x in output_lines]) + return sorted(list(benchmark_names)) + +def run_benchmarks(filename,bnames): + """ + Run benchmarks for each type and collect results + """ + benchmark_results=dict() + for bname in bnames: + output_file=tempfile.mktemp() + result=subprocess.run([filename,f"--benchmark_filter={bname}.*",f"--benchmark_out={output_file}","--benchmark_out_format=csv"]) + result.check_returncode() + benchmark_result=evaluate_benchmark_from_file(output_file)*1000 + benchmark_results[bname]=benchmark_result + print (f"Benchmark {bname} unfiltered: {benchmark_result} ns") + os.remove(output_file) + + return benchmark_results + +def filter_benchmarks(benchmark_results): + """ + Apply filtering rules and print the benchmarks + """ + global filter_rules + print ("Filtered benchmark results:") + max_len=0 + for bname in sorted(benchmark_results.keys()): + if len(bname)>max_len: + max_len=len(bname) + for bname in sorted(benchmark_results.keys()): + if bname not in filter_rules.keys() or filter_rules[bname]==None: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]:.1f}") + else: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]-benchmark_results[filter_rules[bname]]:.1f}") + +if __name__=="__main__": + parser=argparse.ArgumentParser(description='Run all the individual benchmarks',epilog='This expects a single file with a single type of benchmark /i') + 
parser.add_argument("-f","--file",dest="filename",required=True,help="run benchmark FILE", metavar="FILE") + args=parser.parse_args() + filename=args.filename + if filename==None: + parser.print_help() + exit() + benchmark_names=get_benchmarks(filename) + print("Will run the following benchmarks:") + for bname in benchmark_names: + print(f'\t{bname}') + unfiltered_results=run_benchmarks(filename,benchmark_names) + filter_benchmarks(unfiltered_results) + + + \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp new file mode 100644 index 00000000000..32f6082d3a8 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp @@ -0,0 +1,382 @@ +/** + * @file parallel.bench.cpp + * @author Rumata888 + * @brief Simple and not too strict benchmarks for most basic operations used in barretenberg + * @details Filtered benchmark results (nanoseconds): + * cycle_waste: 0.5 + * ff_addition: 3.8 + * ff_from_montgomery: 19.1 + * ff_invert: 7001.3 + * ff_multiplication: 21.3 + * ff_reduce: 5.1 + * ff_sqr: 17.9 + * ff_to_montgomery: 39.1 + * parallel_for_field_element_addition: 198000~388000 (The number is somewhat dependent on the number of cores + * used) + * projective_point_accidental_doubling: 347.6 + * projective_point_addition: 348.6 + * projective_point_doubling: 194.2 + * scalar_multiplication: 50060.1 + * sequential_copy: 3.3 + * + */ +#include "barretenberg/common/thread.hpp" +#include "barretenberg/ecc/curves/bn254/bn254.hpp" +#include + +using namespace benchmark; +using namespace barretenberg; +namespace { +using Curve = curve::BN254; +using Fr = Curve::ScalarField; +#define MAX_REPETITION_LOG 12 + +/** + * @brief Benchmark for evaluating the cost of starting parallel_for + * + * @details It seems parallel_for takes ~400 microseconds to start when we use all the cores. When it's just 1 it's 200 + * microseconds. 
+ * The dependency is not exactly linear, so in the code we use the largest value for convenience.
+ * @param state
+ */
+void parallel_for_field_element_addition(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    size_t num_cpus = get_num_cpus();
+    std::vector<std::vector<Fr>> copy_vector(num_cpus);
+    for (size_t i = 0; i < num_cpus; i++) {
+        for (size_t j = 0; j < 2; j++) {
+            copy_vector[i].emplace_back(Fr::random_element(&engine));
+            copy_vector[i].emplace_back(Fr::random_element(&engine));
+        }
+    }
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_external_cycles = 1 << static_cast<size_t>(state.range(0));
+        size_t num_internal_cycles = 1 << (MAX_REPETITION_LOG - static_cast<size_t>(state.range(0)));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_external_cycles; i++) {
+            parallel_for(num_cpus, [num_internal_cycles, &copy_vector](size_t index) {
+                for (size_t i = 0; i < num_internal_cycles; i++) {
+                    copy_vector[index][i & 1] += copy_vector[index][1 - (i & 1)];
+                }
+            });
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field addition costs (in cache)
+ *
+ * @details ~4 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_addition(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[i & 1] += copy_vector[1 - (i & 1)];
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field multiplication costs (in cache)
+ *
+ * @details ~21 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_multiplication(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[i & 1] *= copy_vector[1 - (i & 1)];
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field squaring costs (in cache)
+ *
+ * @details ~18 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_sqr(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[0] = copy_vector[0].sqr();
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field inversion costs (in cache)
+ *
+ * @details ~7100 ns once we subtract the addition and i++ operations
+ * @param state
+ */
+void ff_invert(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    auto element = Fr::random_element(&engine);
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            element = (element +
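+                // (the +1 varies the input each iteration so repeated inversions cannot be optimized away)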
Fr::one()).invert(); + } + } +} + +/** + * @brief Evaluate how much conversion to montgomery costs (in cache) + * + *@details ~39 ns if we subtract i++ operation + * @param state + */ +void ff_to_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.to_montgomery_form(); + } + } +} +/** + * @brief Evaluate how much conversion from montgomery costs (in cache) + * + *@details ~19 ns if we subtract i++ operation + * @param state + */ +void ff_from_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.from_montgomery_form(); + } + } +} + +/** + * @brief Evaluate how much reduction costs (in cache) + * + *@details ~5 ns if we subtract addition and i++ operation + * @param state + */ +void ff_reduce(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = (element + element).reduce_once(); + } + } +} + +/** + * @brief Evaluate how much projective point addition costs (in cache) + * + *@details ~350 ns if we subtract i++ operation + * @param state + */ +void projective_point_addition(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[i & 1] += copy_vector[1 - (i & 1)]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs when we trigger it through addition (in cache) + * + *@details ~354 ns if we subtract i++ operation + * @param state + */ +void projective_point_accidental_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] += copy_vector[0]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs (in cache) + * + *@details ~195 ns if we subtract i++ operation + * @param state + */ +void projective_point_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + 
copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] = copy_vector[0].dbl(); + } + } +} + +/** + * @brief Evaluate how much scalar multiplication costs (in cache) + * + *@details ~50000 ns + * @param state + */ +void scalar_multiplication(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + Curve::Element element = Curve::Element::random_element(&engine); + Fr scalar = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element * scalar; + scalar += scalar; + } + } +} +/** + * @brief Evaluate how much running the loop costs in benchmarks + * + * @details 0.6~0.7 ns per cycle + * @param state + */ +void cycle_waste(State& state) +{ + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (volatile size_t i = 0; i < num_cycles;) { + i = i + 1; + } + } +} + +/** + * @brief Evaluate how much copying memory for large vectors costs + * + * @details 5 ns per cycle + * @param state + */ +void sequential_copy(State& state) +{ + + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + std::vector input(num_cycles); + for (size_t i = 0; i < num_cycles; i++) { + *(uint256_t*)&input[i] = engine.get_random_uint256(); + } + std::vector output(num_cycles); + + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + output[i] = input[i]; + } + } +} +} // namespace + +BENCHMARK(parallel_for_field_element_addition)->Unit(kMicrosecond)->DenseRange(0, MAX_REPETITION_LOG); +BENCHMARK(ff_addition)->Unit(kMicrosecond)->DenseRange(12, 30); +BENCHMARK(ff_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_sqr)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_invert)->Unit(kMicrosecond)->DenseRange(12, 19); +BENCHMARK(ff_to_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_from_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_reduce)->Unit(kMicrosecond)->DenseRange(12, 29); +BENCHMARK(projective_point_addition)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_accidental_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(scalar_multiplication)->Unit(kMicrosecond)->DenseRange(12, 18); +BENCHMARK(cycle_waste)->Unit(kMicrosecond)->DenseRange(20, 30); +BENCHMARK(sequential_copy)->Unit(kMicrosecond)->DenseRange(20, 25); +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py new file mode 100644 index 00000000000..3f039a21860 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py @@ -0,0 +1,46 @@ +#!/usr/bin/python3 +""" +Tool for analyzing a single benchmark file generated by basics_bench. 
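+The script fits a line through the measurements taken over a range of input sizes and reports the
+per-iteration cost (the linear factor).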
Also used by "analyse_all_benchmarks.py" +For example, in src directory: +./bin/basics --benchmark_filter="par.*" --benchmark_out=parallel_for.csv --benchmark_out_format=csv +python3 ../src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py -f parallel_for.csv +""" +import numpy as np +import argparse +from io import StringIO + +def evaluate_benchmark_from_file(filename): + """ + Take a benchmark file, remove waste and calculate the linear factor + """ + lines=[] + header_found=False + x_exponents=[] + # Google benchmarks have a few extra lines at the start, so we need to skip them + with open (filename) as f: + for line in f: + if line.find("name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message")!=-1: + header_found=True + lines.append(line) + continue + if header_found: + lines.append(line) + x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) + + data=np.genfromtxt(StringIO('\n'.join(lines)),delimiter=",",usemask=True) + # Calculate the linear factor + y=np.transpose(data[1:])[2] + x=np.array([1< class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&G_vec_local, srs_elements](size_t start, size_t end) { for (size_t i = start * 2; i < end * 2; i += 2) { G_vec_local[i >> 1] = srs_elements[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); std::vector b_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&b_vec, &opening_pair](size_t start, size_t end) { Fr b_power = opening_pair.challenge.pow(start); @@ -75,7 +81,8 @@ template class IPA { b_power *= opening_pair.challenge; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1); // Iterate for log(poly_degree) rounds to compute the round commitments. 
auto log_poly_degree = static_cast(numeric::get_msb(poly_degree)); @@ -91,7 +98,7 @@ template class IPA { Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); // Run scalar product in parallel - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) { Fr current_inner_prod_L = Fr::zero(); @@ -105,7 +112,8 @@ template class IPA { inner_prod_R += current_inner_prod_R; addition_lock.unlock(); }, - /*no_multhreading_if_less_or_equal=*/8); + /*finite_field_additions_per_iteration=*/2, + /*finite_field_multiplications_per_iteration=*/2); // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( @@ -137,20 +145,20 @@ template class IPA { // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, - [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size]( - size_t start, size_t end) { + [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) { for (size_t j = start; j < end; j++) { a_vec[j] *= round_challenge; a_vec[j] += round_challenge_inv * a_vec[round_size + j]; b_vec[j] *= round_challenge_inv; b_vec[j] += round_challenge * b_vec[round_size + j]; - - G_vec_local[j] = G_lo[j] + G_hi[j]; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/4, + /*finite_field_multiplications_per_iteration=*/8, + /*finite_field_inversions_per_iteration=*/1); + GroupElement::batch_affine_add(G_lo, G_hi, G_vec_local); } transcript->send_to_verifier("IPA:a_0", a_vec[0]); @@ -218,7 +226,7 @@ template class IPA { // Compute G_zero // First construct s_vec std::vector s_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { for (size_t i = start; i < end; i++) { @@ -235,7 +243,8 @@ template class IPA { s_vec[i] = s_vec_scalar; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/log_poly_degree); auto srs_elements = vk->srs->get_monomial_points(); @@ -245,14 +254,20 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. 
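        // (i.e. the point table is laid out {P_0, endo(P_0), P_1, endo(P_1), ...}, so the original P_i
        // lives at index 2*i; hence the stride-2 copy below)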
-        run_loop_in_parallel(
+        run_loop_in_parallel_if_effective(
             poly_degree,
             [&G_vec_local, srs_elements](size_t start, size_t end) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
             },
-            /*no_multhreading_if_less_or_equal=*/16);
+            /*finite_field_additions_per_iteration=*/0,
+            /*finite_field_multiplications_per_iteration=*/0,
+            /*finite_field_inversions_per_iteration=*/0,
+            /*group_element_additions_per_iteration=*/0,
+            /*group_element_doublings_per_iteration=*/0,
+            /*scalar_multiplications_per_iteration=*/0,
+            /*sequential_copy_ops_per_iteration=*/1);
 
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
index 232c30be69d..b90a44ae146 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.cpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -127,3 +127,80 @@ void run_loop_in_parallel(size_t num_points,
         func(start, end);
     });
 };
+
+/**
+ * @brief Split a loop into several loops running in parallel, based on the operations in one iteration
+ *
+ * @details Splits num_points into an appropriate number of chunks and calls the function that should
+ * contain the work loop on each of them, but only if parallelization is worth it
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){...}}
+ */
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration,
+                                       size_t finite_field_multiplications_per_iteration,
+                                       size_t finite_field_inversions_per_iteration,
+                                       size_t group_element_additions_per_iteration,
+                                       size_t group_element_doublings_per_iteration,
+                                       size_t scalar_multiplications_per_iteration,
+                                       size_t sequential_copy_ops_per_iteration)
+{
+    // Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds):
+    constexpr size_t FF_ADDITION_COST = 4;
+    constexpr size_t FF_MULTIPLICATION_COST = 21;
+    constexpr size_t FF_INVERSION_COST = 7000;
+    constexpr size_t GE_ADDITION_COST = 350;
+    constexpr size_t GE_DOUBLING_COST = 194;
+    constexpr size_t SM_COST = 50000;
+    constexpr size_t SEQ_COPY_COST = 3;
+    // We take the maximum observed parallel_for startup cost (388 us) and round it up.
+    // The goal of these checks is to avoid increasing processing time significantly (10x) for small
+    // workloads, so we accept not triggering parallel_for even when it would make a medium workload
+    // faster by up to half a millisecond
+    constexpr size_t PARALLEL_FOR_COST = 400000;
+    // Get the number of cpus we can split the work into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
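+    // Worked example (illustrative figures, assuming num_cpus = 16): for 2^20 iterations of one field
+    // addition each, offset_cost ~= (2^20 - 2^16) * 4 ns ~= 3.9 ms > PARALLEL_FOR_COST, so the loop is
+    // parallelized; for 2^16 iterations the estimated saving (~250 us) is below the ~400 us startup
+    // cost, so the loop runs on a single thread.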
+
+    // Compute the cost of all operations done by other threads
+    const size_t offset_cost =
+        (num_points - chunk_size) *
+        (finite_field_additions_per_iteration * FF_ADDITION_COST +
+         finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
+         finite_field_inversions_per_iteration * FF_INVERSION_COST +
+         group_element_additions_per_iteration * GE_ADDITION_COST +
+         group_element_doublings_per_iteration * GE_DOUBLING_COST +
+         scalar_multiplications_per_iteration * SM_COST +
+         sequential_copy_ops_per_iteration * SEQ_COPY_COST);
+
+    // If starting parallel_for takes longer than the computation itself, just compute
+    if (offset_cost < PARALLEL_FOR_COST) {
+        func(0, num_points);
+        return;
+    }
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ for the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+};
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp
index fe799e1d46f..787f0313d9e 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -25,4 +25,13 @@ inline size_t get_num_cpus_pow2()
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
 void run_loop_in_parallel(size_t num_points,
                           const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal = 0);
\ No newline at end of file
+                          size_t no_multhreading_if_less_or_equal = 0);
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration = 0,
+                                       size_t finite_field_multiplications_per_iteration = 0,
+                                       size_t finite_field_inversions_per_iteration = 0,
+                                       size_t group_element_additions_per_iteration = 0,
+                                       size_t group_element_doublings_per_iteration = 0,
+                                       size_t scalar_multiplications_per_iteration = 0,
+                                       size_t sequential_copy_ops_per_iteration = 0);
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
index 420b4649856..22477f8352e 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
@@ -91,6 +91,9 @@ template <class Fq, class Fr, class T> class alignas(32) element {
     BBERG_INLINE constexpr bool operator==(const element& other) const noexcept;
 
     static void batch_normalize(element* elements, size_t num_elements) noexcept;
+    static void batch_affine_add(const std::span<affine_element<Fq, Fr, T>>& first_group,
+                                 const std::span<affine_element<Fq, Fr, T>>& second_group,
+                                 const std::span<affine_element<Fq, Fr, T>>& results) noexcept;
     static std::vector<affine_element<Fq, Fr, T>> batch_mul_with_endomorphism(
         const std::span<affine_element<Fq, Fr, T>>& points, const Fr& exponent) noexcept;
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
index da98fb74f8f..a679fa6152c 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -688,6 +688,89 @@ element<Fq, Fr, T> element<Fq, Fr, T>::mul_with_endomorphism(const Fr& exponent)
     return work_element;
 }
 
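+// Usage sketch (illustrative, not part of the diff): pairwise-add two arrays of affine points with a
+// single batched inversion, writing lhs[i] + rhs[i] into out[i]:
+//     std::vector<AffineElement> lhs = ..., rhs = ..., out(lhs.size());
+//     Element::batch_affine_add(lhs, rhs, out);
+// Note the batch-inversion formula divides by (x2 - x1), so the caller must ensure no pair triggers
+// the doubling or point-at-infinity edge cases.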
+/**
+ * @brief Pairwise add the affine points in first_group and second_group
+ *
+ * @param first_group
+ * @param second_group
+ * @param results
+ */
+template <class Fq, class Fr, class T>
+void element<Fq, Fr, T>::batch_affine_add(const std::span<affine_element<Fq, Fr, T>>& first_group,
+                                          const std::span<affine_element<Fq, Fr, T>>& second_group,
+                                          const std::span<affine_element<Fq, Fr, T>>& results) noexcept
+{
+    typedef affine_element<Fq, Fr, T> affine_element;
+    const size_t num_points = first_group.size();
+    ASSERT(second_group.size() == first_group.size());
+
+    // Space for temporary values
+    std::vector<Fq> scratch_space(num_points);
+
+    run_loop_in_parallel_if_effective(
+        num_points,
+        [&results, &first_group](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                results[i] = first_group[i];
+            }
+        },
+        /*finite_field_additions_per_iteration=*/0,
+        /*finite_field_multiplications_per_iteration=*/0,
+        /*finite_field_inversions_per_iteration=*/0,
+        /*group_element_additions_per_iteration=*/0,
+        /*group_element_doublings_per_iteration=*/0,
+        /*scalar_multiplications_per_iteration=*/0,
+        /*sequential_copy_ops_per_iteration=*/2);
+
+    // TODO(#826): Same code as in batch mul
+    // we can mutate rhs but NOT lhs!
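+    // (Montgomery batch inversion: the forward pass accumulates prefix products of (x2 - x1) while
+    // scaling (y2 - y1); after a single invert() the backward pass recovers each individual
+    // 1/(x2 - x1), giving lambda = (y2 - y1)/(x2 - x1), x3 = lambda^2 - x1 - x2 and
+    // y3 = lambda * (x1 - x3) - y1 per point, at the cost of one field inversion overall.)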
+ // output is stored in rhs /** * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion * @@ -743,15 +827,16 @@ std::vector> element::batch_mul_with_endomo * @brief Perform batch affine addition in parallel * */ - const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, - affine_element* rhs) { - run_loop_in_parallel( - num_points, - [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { - batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); - }, - /*no_multhreading_if_less_or_equal=*/4); - }; + const auto batch_affine_add_internal = + [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) { + run_loop_in_parallel_if_effective( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + /*finite_field_additions_per_iteration=*/6, + /*finite_field_multiplications_per_iteration=*/6); + }; /** * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion @@ -789,12 +874,13 @@ std::vector> element::batch_mul_with_endomo * */ const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) { batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start); }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/7, + /*finite_field_multiplications_per_iteration=*/6); }; // Compute wnaf for scalar const Fr converted_scalar = exponent.from_montgomery_form(); @@ -804,14 +890,20 @@ std::vector> element::batch_mul_with_endomo affine_element result{ Fq::zero(), Fq::zero() }; result.self_set_infinity(); std::vector results(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&results, result](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { results[i] = result; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); return results; } @@ -824,7 +916,7 @@ std::vector> element::batch_mul_with_endomo } // Initialize first etnries in lookup table std::vector temp_point_vector(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -832,20 +924,32 @@ std::vector> element::batch_mul_with_endomo lookup_table[0][i] = points[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/2); // Construct lookup table batch_affine_double(&temp_point_vector[0]); for (size_t j = 1; j < lookup_size; ++j) { - 
run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, &lookup_table](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { lookup_table[j][i] = lookup_table[j - 1][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &lookup_table[j][0]); } uint64_t wnaf_table[num_rounds * 2]; @@ -873,7 +977,7 @@ std::vector> element::batch_mul_with_endomo index = wnaf_entry & 0x0fffffffU; sign = static_cast((wnaf_entry >> 31) & 1); const bool is_odd = ((j & 1) == 1); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start, size_t end) { @@ -891,10 +995,16 @@ std::vector> element::batch_mul_with_endomo } } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); } // First cycle of addition - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); // Run through SM logic in wnaf form (excluding the skew) for (size_t j = 2; j < num_rounds * 2; ++j) { wnaf_entry = wnaf_table[j]; @@ -906,7 +1016,7 @@ std::vector> element::batch_mul_with_endomo batch_affine_double(&work_elements[0]); } } - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -919,13 +1029,19 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = to_add; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 
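                // (odd wnaf rounds use the endomorphism copy of the point, which needs one extra Fq
                // multiplication by beta)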
1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the first endo scalar if (skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -933,12 +1049,18 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = -lookup_table[0][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the second endo scalar if (endo_skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -947,8 +1069,14 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i].x *= beta; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } return work_elements;