Skip to content

Commit

Permalink
chore: Parallelise construction of perturbator coefficients at each l…
Browse files Browse the repository at this point in the history
…evel (#10304)

When constructing the perturbator via the tree technique, we can
parallelise the construction of the coefficients at each level, which is
one of the causes of the performance degradation seen when using an ambient
trace of size 2^20. We see a ~1s improvement in performance from doing this.

With a 2^20 trace (`EXAMPLE_20`), `ClientIVCBench/Full/6` took
`38114 ms` on master and now takes `37310 ms`. For the de facto
`CLIENT_IVC_BENCH_STRUCTURE`, which gives 2^19 finalised circuits, we go
from `29496 ms` to `29188 ms`, so the change is not as impactful there, but
the performance does not degrade either.

I have also benchmarked the actual computation of coefficients and it is
negligible regardless of the ambient trace size (2^20 vs 2^19).
  • Loading branch information
maramihali authored Nov 29, 2024
1 parent 0311bf3 commit ba335bd
Showing 1 changed file with 64 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,36 @@ template <class DeciderProvingKeys_> class ProtogalaxyProverInternal {

return aggregated_relation_evaluations;
}

/**
 * @brief Allocate the storage for every node of one level of the coefficient tree.
 *
 * @details The outer vector is created with level_width (empty) nodes; each node is then resized to hold
 * degree + 1 coefficients. The per-node resizing is split into contiguous chunks, one per thread, with the thread
 * count taken from calculate_num_threads(level_width). Any nodes left over after the even split are handled by the
 * last thread.
 *
 * NOTE(review): the coefficient values after resize() are whatever default construction of FF produces. Callers in
 * this file accumulate into these entries with +=, so confirm that FF default-constructs to zero.
 *
 * @param level_width number of nodes at this level
 * @param degree degree of the polynomial stored in each node; each node receives degree + 1 entries
 *
 * @return std::vector<std::vector<FF>> level_width nodes, each of size degree + 1
 */
static std::vector<std::vector<FF>> initialise_coefficient_tree_level(const size_t level_width, const size_t degree)
{
    PROFILE_THIS_NAME("initialise coefficient tree level");

    // Representing a polynomial of a certain degree requires degree + 1 coefficients
    const size_t coefficients_per_node = degree + 1;

    std::vector<std::vector<FF>> nodes(level_width);

    // Even split of the nodes across threads; the remainder is appended to the last thread's chunk.
    const size_t thread_count = calculate_num_threads(level_width);
    const size_t chunk_size = level_width / thread_count;
    const size_t remainder = level_width - (chunk_size * thread_count);

    parallel_for(thread_count, [&](size_t thread_idx) {
        const bool is_last_thread = (thread_idx == thread_count - 1);
        const size_t offset = thread_idx * chunk_size;
        const size_t range = is_last_thread ? chunk_size + remainder : chunk_size;
        ASSERT(offset < level_width || level_width == 0);
        ASSERT((offset + range) <= level_width);

        const size_t chunk_end = offset + range;
        for (size_t node_idx = offset; node_idx < chunk_end; ++node_idx) {
            nodes[node_idx].resize(coefficients_per_node);
        }
    });

    return nodes;
}
/**
* @brief Recursively compute the parent nodes of each level in the tree, starting from the leaves. Note that at
* each level, the resulting parent nodes will be polynomials of degree (level+1) because we multiply by an
Expand All @@ -171,24 +201,28 @@ template <class DeciderProvingKeys_> class ProtogalaxyProverInternal {
const std::vector<std::vector<FF>>& prev_level_coeffs,
size_t level = 1)
{

if (level == betas.size()) {
return prev_level_coeffs[0];
}

auto degree = level + 1;
auto prev_level_width = prev_level_coeffs.size();
std::vector<std::vector<FF>> level_coeffs(prev_level_width / 2, std::vector<FF>(degree + 1, 0));
parallel_for_heuristic(
prev_level_width / 2,
[&](size_t parent) {
size_t node = parent * 2;
std::copy(prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
const size_t degree = level + 1;
const size_t level_width = prev_level_coeffs.size() / 2;
std::vector<std::vector<FF>> level_coeffs = initialise_coefficient_tree_level(level_width, degree);
{
PROFILE_THIS_NAME("other coefficients tree computation");
parallel_for_heuristic(
level_width,
[&](size_t parent) {
size_t node = parent * 2;
std::copy(
prev_level_coeffs[node].begin(), prev_level_coeffs[node].end(), level_coeffs[parent].begin());
for (size_t d = 0; d < degree; d++) {
level_coeffs[parent][d] += prev_level_coeffs[node + 1][d] * betas[level];
level_coeffs[parent][d + 1] += prev_level_coeffs[node + 1][d] * deltas[level];
}
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * degree * 3);
}
return construct_coefficients_tree(betas, deltas, level_coeffs, level + 1);
}

Expand All @@ -206,17 +240,21 @@ template <class DeciderProvingKeys_> class ProtogalaxyProverInternal {
std::span<const FF> deltas,
const std::vector<FF>& full_honk_evaluations)
{
auto width = full_honk_evaluations.size();
std::vector<std::vector<FF>> first_level_coeffs(width / 2, std::vector<FF>(2, 0));
parallel_for_heuristic(
width / 2,
[&](size_t parent) {
size_t node = parent * 2;
first_level_coeffs[parent][0] =
full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);

const size_t width = full_honk_evaluations.size() / 2;
std::vector<std::vector<FF>> first_level_coeffs = initialise_coefficient_tree_level(width, 1);
{
PROFILE_THIS_NAME("perturbator coefficients first level computation");
parallel_for_heuristic(
width,
[&](size_t parent) {
const size_t node = parent * 2;
first_level_coeffs[parent][0] =
full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
first_level_coeffs[parent][1] = full_honk_evaluations[node + 1] * deltas[0];
},
/* overestimate */ thread_heuristics::FF_MULTIPLICATION_COST * 3);
}
return construct_coefficients_tree(betas, deltas, first_level_coeffs);
}

Expand Down

1 comment on commit ba335bd

@AztecBot
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'C++ Benchmark'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.05.

Benchmark suite Current: ba335bd Previous: 0311bf3 Ratio
wasmClientIVCBench/Full/6 95384.399199 ms/iter 87862.19409100001 ms/iter 1.09

This comment was automatically generated by workflow using github-action-benchmark.

CC: @ludamad @codygunton

Please sign in to comment.