From ba335bdff645398d20241ce7baab02f63b20f55c Mon Sep 17 00:00:00 2001
From: maramihali
Date: Fri, 29 Nov 2024 13:31:09 +0000
Subject: [PATCH] chore: Parallelise construction of perturbator coefficients at each level (#10304)

When constructing the perturbator via the tree technique, we can parallelise
the construction of the coefficients at each level; this construction is one
of the culprits of performance degradation when using an ambient trace of
size 2^20. We see a ~1s improvement in performance from doing this: with a
2^20 trace (`EXAMPLE_20`), `ClientIVCBench/Full/6` in master took `38114 ms`
and now takes `37310 ms`. For the de facto `CLIENT_IVC_BENCH_STRUCTURE`,
which gives 2^19 finalised circuits, we go from `29496 ms` to `29188 ms`, so
the change is less impactful there, but performance no longer degrades. I
have also benchmarked the actual computation of the coefficients and it is
negligible regardless of the ambient trace size (2^20 vs 2^19). A simplified
standalone sketch of the tree construction appears after the diff.

---
 .../protogalaxy_prover_internal.hpp | 90 +++++++++++++------
 1 file changed, 64 insertions(+), 26 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp
index efad2d0e072..c635da8cfa2 100644
--- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp
+++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_internal.hpp
@@ -161,6 +161,36 @@ template class ProtogalaxyProverInternal {
         return aggregated_relation_evaluations;
     }
 
+
+    /**
+     * @brief Initialise the data structure storing a set of nodes at a given level, in parallel if the width is
+     * sufficiently big
+     *
+     * @param level_width determines the number of nodes for the given level
+     * @param degree determines the degree of the polynomial stored in each node; the number of elements will be
+     * degree + 1
+     *
+     * @return std::vector<std::vector<FF>>
+     */
+    static std::vector<std::vector<FF>> initialise_coefficient_tree_level(const size_t level_width, const size_t degree)
+    {
+        PROFILE_THIS_NAME("initialise coefficient tree level");
+        std::vector<std::vector<FF>> level_coeffs(level_width);
+        const size_t num_threads = calculate_num_threads(level_width);
+        const size_t range_per_thread = level_width / num_threads;
+        const size_t leftovers = level_width - (range_per_thread * num_threads);
+        parallel_for(num_threads, [&](size_t j) {
+            const size_t offset = j * range_per_thread;
+            const size_t range = (j == num_threads - 1) ? range_per_thread + leftovers : range_per_thread;
+            ASSERT(offset < level_width || level_width == 0);
+            ASSERT((offset + range) <= level_width);
+            for (size_t idx = offset; idx < offset + range; idx++) {
+                // Representing a polynomial of a certain degree requires degree + 1 coefficients
+                level_coeffs[idx].resize(degree + 1);
+            }
+        });
+        return level_coeffs;
+    }
     /**
      * @brief Recursively compute the parent nodes of each level in the tree, starting from the leaves. Note that at
      * each level, the resulting parent nodes will be polynomials of degree (level+1) because we multiply by an
@@ -171,24 +201,28 @@ template class ProtogalaxyProverInternal {
                                                        const std::vector<std::vector<FF>>& prev_level_coeffs,
                                                        size_t level = 1)
     {
+
         if (level == betas.size()) {
             return prev_level_coeffs[0];
         }
-
-        auto degree = level + 1;
-        auto prev_level_width = prev_level_coeffs.size();
-        std::vector<std::vector<FF>> level_coeffs(prev_level_width / 2, std::vector<FF>(degree + 1, 0));
-        parallel_for_heuristic(
-            prev_level_width / 2,
-            [&](size_t parent) {
-                size_t node = parent * 2;
-                std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
-                for (size_t d = 0; d < degree; d++) {
-                    level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
-                    level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
-                }
-            },
-            /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
+        const size_t degree = level + 1;
+        const size_t level_width = prev_level_coeffs.size() / 2;
+        std::vector<std::vector<FF>> level_coeffs = initialise_coefficient_tree_level(level_width, degree);
+        {
+            PROFILE_THIS_NAME("other coefficients tree computation");
+            parallel_for_heuristic(
+                level_width,
+                [&](size_t parent) {
+                    size_t node = parent * 2;
+                    std::copy(
+                        prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
+                    for (size_t d = 0; d < degree; d++) {
+                        level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
+                        level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
+                    }
+                },
+                /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
+        }
         return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
     }
 
@@ -206,17 +240,21 @@ template class ProtogalaxyProverInternal {
                                                                std::span<const FF> deltas,
                                                                const std::vector<FF>& full_honk_evaluations)
     {
-        auto width = full_honk_evaluations.size();
-        std::vector<std::vector<FF>> first_level_coeffs(width / 2, std::vector<FF>(2, 0));
-        parallel_for_heuristic(
-            width / 2,
-            [&](size_t parent) {
-                size_t node = parent * 2;
-                first_level_coeffs[parent][0] =
-                    full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
-                first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
-            },
-            /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
+
+        const size_t width = full_honk_evaluations.size() / 2;
+        std::vector<std::vector<FF>> first_level_coeffs = initialise_coefficient_tree_level(width, 1);
+        {
+            PROFILE_THIS_NAME("perturbator coefficients first level computation");
+            parallel_for_heuristic(
+                width,
+                [&](size_t parent) {
+                    const size_t node = parent * 2;
+                    first_level_coeffs[parent][0] =
+                        full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
+                    first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
+                },
+                /* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
+        }
         return construct_coefficients_tree(betas, deltas, first_level_coeffs);
     }
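For reviewers unfamiliar with the tree technique, below is a minimal standalone sketch of the construction this patch parallelises. It is not the barretenberg implementation: the field type `FF` is stood in for by `double`, `parallel_for`/`calculate_num_threads`/`PROFILE_THIS_NAME` are replaced by a plain `std::thread` chunking of the same shape as `initialise_coefficient_tree_level`, and the helper name `initialise_level` is made up for the sketch.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

using FF = double; // stand-in for the real field type

// Allocate the (degree + 1)-sized coefficient vector of every node in one tree level,
// chunking the allocations across threads the way the patch's helper does.
std::vector<std::vector<FF>> initialise_level(size_t level_width, size_t degree)
{
    std::vector<std::vector<FF>> level(level_width);
    const size_t num_threads =
        std::max<size_t>(1, std::min<size_t>(std::thread::hardware_concurrency(), level_width));
    const size_t chunk = level_width / num_threads;
    std::vector<std::thread> workers;
    for (size_t t = 0; t < num_threads; t++) {
        const size_t start = t * chunk;
        const size_t end = (t == num_threads - 1) ? level_width : start + chunk;
        workers.emplace_back([&level, degree, start, end] {
            for (size_t idx = start; idx < end; idx++) {
                level[idx].assign(degree + 1, 0); // degree + 1 coefficients, all zero
            }
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    return level;
}

// Combine pairs of child polynomials into parents, one level at a time. A parent at `level`
// is child_even + (betas[level] + deltas[level] * X) * child_odd, so its degree is level + 1.
std::vector<FF> construct_coefficients_tree(const std::vector<FF>& betas,
                                            const std::vector<FF>& deltas,
                                            const std::vector<std::vector<FF>>& prev_level,
                                            size_t level = 1)
{
    if (level == betas.size()) {
        return prev_level[0];
    }
    const size_t degree = level + 1;
    const size_t width = prev_level.size() / 2;
    std::vector<std::vector<FF>> level_coeffs = initialise_level(width, degree);
    for (size_t parent = 0; parent < width; parent++) {
        const size_t node = parent * 2;
        std::copy(prev_level[node].begin(), prev_level[node].end(), level_coeffs[parent].begin());
        for (size_t d = 0; d < degree; d++) {
            level_coeffs[parent][d] += prev_level[node + 1][d] * betas[level];
            level_coeffs[parent][d + 1] += prev_level[node + 1][d] * deltas[level];
        }
    }
    return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
}

// Leaves: pair up the full Honk evaluations into linear polynomials using betas[0]/deltas[0].
std::vector<FF> construct_perturbator_coefficients(const std::vector<FF>& betas,
                                                   const std::vector<FF>& deltas,
                                                   const std::vector<FF>& evals)
{
    const size_t width = evals.size() / 2;
    std::vector<std::vector<FF>> first_level = initialise_level(width, 1);
    for (size_t parent = 0; parent < width; parent++) {
        const size_t node = parent * 2;
        first_level[parent][0] = evals[node] + evals[node + 1] * betas[0];
        first_level[parent][1] = evals[node + 1] * deltas[0];
    }
    return construct_coefficients_tree(betas, deltas, first_level);
}

int main()
{
    // 8 leaf evaluations -> 3 combining rounds -> a perturbator of degree 3 (4 coefficients).
    const std::vector<FF> evals = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const std::vector<FF> betas = { 2, 3, 5 };
    const std::vector<FF> deltas = { 7, 11, 13 };
    for (FF coeff : construct_perturbator_coefficients(betas, deltas, evals)) {
        std::cout << coeff << ' ';
    }
    std::cout << '\n';
}
```

The point of the patch is visible in `initialise_level`: allocating the `degree + 1` coefficient vectors of a wide level is itself sharded across threads before the (already parallel) combine step runs, which is the work the commit message attributes the ~1s saving on the 2^20 trace to.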