Skip to content

Commit

Permalink
feat: parallelization update for polynomials (#2311)
Browse files Browse the repository at this point in the history
I added a parallelization update to polynomials, specifically to the
`add_scaled`, `operator+=`, `operator-=`, and `operator*=` functions.
This was done using the parallel_for function, replacing the simple for
loops. This update has sped up the Gemini component without queue
processing (just `execute_univariatization_round()`) by roughly 5-10x.
In particular, with a circuit of 2^19 gates, this component's runtime
changes from ~520ms to ~65ms. Overall, this change leads to ~20%
improvements in Honk for larger circuit sizes. Note that these values
are noisy because of mainframe load. Another side note is that the
Shplonk component seems to be slower with these changes for small
circuit sizes (2^13 or so), and this needs further investigation. For
larger circuit sizes, we observe a ~2x speedup in the Shplonk component.

More benchmarking details
[here](https://docs.google.com/spreadsheets/d/1mFquxU0nbigLwmttcfxC2Yc9J8-TbjLKOHeRMtY7JfM/).

I also created a new thread_utils file for standardization purposes, which
includes the `calculate_num_threads` and `calculate_num_threads_pow2`
functions. These functions calculate the number of threads to use when
splitting up work, based on the minimum number of iterations per thread
and the total number of iterations.

---------

Co-authored-by: codygunton <[email protected]>
  • Loading branch information
lucasxia01 and codygunton authored Sep 23, 2023
1 parent 882682b commit 922fc99
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 28 deletions.
40 changes: 40 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "thread_utils.hpp"

namespace barretenberg::thread_utils {
/**
 * @brief Calculates the number of threads to use, given a minimum number of iterations per thread.
 * @details Computes `desired_num_threads = num_iterations / min_iterations_per_thread` and returns the
 * smaller of that value and `get_num_cpus()`, but never less than 1.
 * The result is not necessarily a power of 2; use `calculate_num_threads_pow2` when that is required.
 *
 * @param num_iterations total number of iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations that justifies a separate thread;
 * a value of 0 is treated as 1 to avoid dividing by zero
 * @return size_t number of threads, always >= 1
 */
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
{
    // A minimum of zero would make the division below undefined; treat it as "no minimum".
    const size_t min_iterations = std::max<size_t>(min_iterations_per_thread, 1);
    const size_t max_num_threads = get_num_cpus(); // upper bound on the thread count
    const size_t desired_num_threads = num_iterations / min_iterations;
    // Use fewer threads than the maximum when the workload is small, but always at least one.
    return std::max<size_t>(std::min(desired_num_threads, max_num_threads), 1);
}

/**
 * @brief Calculates the number of threads to use, given a minimum number of iterations per thread,
 * rounded down to a power of 2.
 * @details Same idea as `calculate_num_threads`: `num_iterations / min_iterations_per_thread` is rounded
 * down to a power of 2 (via its most significant bit) and then capped by `get_num_cpus_pow2()`.
 * The result is a power of 2 provided `get_num_cpus_pow2()` returns one, as its name indicates.
 *
 * @param num_iterations total number of iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations that justifies a separate thread;
 * a value of 0 is treated as 1 to avoid dividing by zero
 * @return size_t number of threads, always >= 1
 */
size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
{
    // A minimum of zero would make the division below undefined; treat it as "no minimum".
    const size_t min_iterations = std::max<size_t>(min_iterations_per_thread, 1);
    const size_t desired_num_threads = num_iterations / min_iterations;
    if (desired_num_threads == 0) {
        // Fewer iterations than the minimum: one thread. Handled explicitly so that the
        // most-significant-bit helper is never asked about zero.
        return 1;
    }
    const size_t max_num_threads = get_num_cpus_pow2(); // upper bound on the thread count
    // Round the desired count down to the nearest power of 2.
    const size_t desired_pow2 = static_cast<size_t>(1ULL << numeric::get_msb(desired_num_threads));
    // Use fewer threads than the maximum when the workload is small, but always at least one.
    return std::max<size_t>(std::min(desired_pow2, max_num_threads), 1);
}

} // namespace barretenberg::thread_utils
29 changes: 29 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/thread_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once
// The include guard above is required: this header defines a constant and declares default
// arguments, both of which are errors if the header is pulled into one translation unit twice.
#include "thread.hpp"
#include <cstddef>

namespace barretenberg::thread_utils {

// Default minimum number of iterations a thread must receive before another thread is used.
constexpr size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;

/**
 * @brief calculates number of threads to create based on minimum iterations per thread
 * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
 * Returns the min of `desired_num_threads` and `max_num_threads`.
 * Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
 *
 * @param num_iterations total number of iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations per thread
 * @return size_t number of threads to use
 */
size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

/**
 * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
 * @details Same functionality as `calculate_num_threads` but guaranteed power of 2
 * @param num_iterations total number of iterations to distribute
 * @param min_iterations_per_thread minimum number of iterations per thread
 * @return size_t number of threads to use
 */
size_t calculate_num_threads_pow2(size_t num_iterations,
                                  size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

} // namespace barretenberg::thread_utils
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once
#include "barretenberg/common/log.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/common/thread_utils.hpp"
#include "barretenberg/polynomials/barycentric.hpp"
#include "barretenberg/polynomials/pow.hpp"
#include "barretenberg/proof_system/flavor/flavor.hpp"
Expand Down Expand Up @@ -140,12 +141,10 @@ template <typename Flavor> class SumcheckProverRound {
// Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
// on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
// For now we use a power of 2 number of threads simply to ensure the round size is evenly divided.
size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
size_t desired_num_threads = round_size / min_iterations_per_thread;
size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1
size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread
size_t num_threads =
barretenberg::thread_utils::calculate_num_threads_pow2(round_size, min_iterations_per_thread);
size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread

// Construct univariate accumulator containers; one per thread
std::vector<RelationUnivariates> thread_univariate_accumulators(num_threads);
Expand Down
67 changes: 45 additions & 22 deletions barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "polynomial.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/slab_allocator.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/common/thread_utils.hpp"
#include "polynomial_arithmetic.hpp"
#include <cstddef>
#include <fcntl.h>
Expand Down Expand Up @@ -306,25 +308,34 @@ template <typename Fr> void Polynomial<Fr>::add_scaled(std::span<const Fr> other
const size_t other_size = other.size();
ASSERT(in_place_operation_viable(other_size));

/** TODO parallelize using some kind of generic evaluation domain
* we really only need to know the thread size, but we don't need all the FFT roots
*/
for (size_t i = 0; i < other_size; ++i) {
coefficients_.get()[i] += scaling_factor * other[i];
}
// Calculates number of threads with thread_utils::calculate_num_threads
size_t num_threads = thread_utils::calculate_num_threads(other_size);
size_t range_per_thread = other_size / num_threads;
size_t leftovers = other_size - (range_per_thread * num_threads);
parallel_for(num_threads, [&](size_t j) {
size_t offset = j * range_per_thread;
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
for (size_t i = offset; i < end; ++i) {
coefficients_.get()[i] += scaling_factor * other[i];
}
});
}

/**
 * @brief Adds the coefficients of `other` to this polynomial in place.
 * @details The index range [0, other.size()) is split into contiguous chunks, one per thread, and each
 * chunk is processed through `parallel_for`. Every coefficient is updated exactly once.
 *
 * @param other coefficients to add; its size must pass `in_place_operation_viable`
 * @return Polynomial<Fr>& reference to this polynomial
 */
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator+=(std::span<const Fr> other)
{
    const size_t other_size = other.size();
    ASSERT(in_place_operation_viable(other_size));

    // Thread count is derived from the amount of work; it is used as a divisor below.
    const size_t num_threads = thread_utils::calculate_num_threads(other_size);
    const size_t range_per_thread = other_size / num_threads;
    parallel_for(num_threads, [&](size_t j) {
        const size_t offset = j * range_per_thread;
        // The last thread also handles the remainder left over by the integer division.
        const size_t end = (j == num_threads - 1) ? other_size : offset + range_per_thread;
        for (size_t i = offset; i < end; ++i) {
            coefficients_.get()[i] += other[i];
        }
    });

    return *this;
}
Expand All @@ -334,23 +345,35 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(std::span<cons
const size_t other_size = other.size();
ASSERT(in_place_operation_viable(other_size));

/** TODO parallelize using some kind of generic evaluation domain
* we really only need to know the thread size, but we don't need all the FFT roots
*/
for (size_t i = 0; i < other_size; ++i) {
coefficients_.get()[i] -= other[i];
}
size_t num_threads = thread_utils::calculate_num_threads(other_size);
size_t range_per_thread = other_size / num_threads;
size_t leftovers = other_size - (range_per_thread * num_threads);
parallel_for(num_threads, [&](size_t j) {
size_t offset = j * range_per_thread;
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
for (size_t i = offset; i < end; ++i) {
coefficients_.get()[i] -= other[i];
}
});

return *this;
}

/**
 * @brief Multiplies every coefficient of this polynomial by `scaling_factor` in place.
 * @details The index range [0, size_) is split into contiguous chunks, one per thread, and each chunk
 * is processed through `parallel_for`. Every coefficient is scaled exactly once.
 *
 * @param scaling_factor value each coefficient is multiplied by
 * @return Polynomial<Fr>& reference to this polynomial
 */
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_factor)
{
    ASSERT(in_place_operation_viable());

    // Thread count is derived from the amount of work; it is used as a divisor below.
    const size_t num_threads = thread_utils::calculate_num_threads(size_);
    const size_t range_per_thread = size_ / num_threads;
    parallel_for(num_threads, [&](size_t j) {
        const size_t offset = j * range_per_thread;
        // The last thread also handles the remainder left over by the integer division.
        const size_t end = (j == num_threads - 1) ? size_ : offset + range_per_thread;
        for (size_t i = offset; i < end; ++i) {
            coefficients_.get()[i] *= scaling_factor;
        }
    });

    return *this;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ template <typename Fr> class Polynomial {
*
* @param scaling_factor s
*/
Polynomial& operator*=(const Fr scaling_factor);

/**
* @brief evaluates p(X) = ∑ᵢ aᵢ⋅Xⁱ considered as multi-linear extension p(X₀,…,Xₘ₋₁) = ∑ᵢ aᵢ⋅Lᵢ(X₀,…,Xₘ₋₁)
Expand Down

0 comments on commit 922fc99

Please sign in to comment.