feat: parallelization update for polynomials (#2311)

I added a parallelization update to polynomials, specifically to the `add_scaled`, `operator+=`, `operator-=`, and `operator*=` functions. This was done by replacing the simple for loops with calls to the `parallel_for` function. This update has sped up the Gemini component without queue processing (just `execute_univariatization_round()`) by roughly 5-10x. In particular, with a circuit of 2^19 gates, this component's runtime drops from ~520ms to ~65ms. Overall, this change leads to ~20% improvements in Honk for larger circuit sizes. Note that these values are noisy because of mainframe load. As a side note, the Shplonk component seems to be slower with these changes for small circuit sizes (2^13 or so); this needs further investigation. For larger circuit sizes, we observe a ~2x speedup in the Shplonk component. More benchmarking details [here](https://docs.google.com/spreadsheets/d/1mFquxU0nbigLwmttcfxC2Yc9J8-TbjLKOHeRMtY7JfM/). I also created a new thread_utils file for standardization purposes, which includes the `calculate_num_threads` and `calculate_num_threads_pow2` functions. These functions calculate the number of threads to split the work across, based on the minimum number of iterations per thread and the total number of iterations. --------- Co-authored-by: codygunton <[email protected]>
AztecProtocol · Sep 23, 2023 · 922fc99 · 922fc99
1 parent 882682b
commit 922fc99
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 28 deletions.
diff --git a/barretenberg/cpp/src/barretenberg/common/thread_utils.cpp b/barretenberg/cpp/src/barretenberg/common/thread_utils.cpp
@@ -0,0 +1,40 @@
+#include "thread_utils.hpp"
+
+namespace barretenberg::thread_utils {
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread
+ * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`.
+ * Returns the min of `desired_num_threads` and `max_num_threads`, but never less than 1.
+ * The result is not necessarily a power of 2; use `calculate_num_threads_pow2` if that is required.
+ * A `min_iterations_per_thread` of 0 is treated as 1 so that the division is always defined.
+ * @param num_iterations total number of iterations to distribute across threads
+ * @param min_iterations_per_thread minimum number of iterations that justifies its own thread
+ * @return size_t number of threads to use (at least 1)
+ */
+size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
+{
+    size_t max_num_threads = get_num_cpus(); // number of available threads
+    size_t min_iterations = min_iterations_per_thread > 0 ? min_iterations_per_thread : 1; // avoid division by zero
+    size_t desired_num_threads = num_iterations / min_iterations;
+    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
+    return num_threads > 0 ? num_threads : 1;                            // ensure at least 1 thread
+}
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
+ * @details Same as `calculate_num_threads` but the desired count is rounded down to a power of 2 first.
+ * @param num_iterations total number of iterations to distribute across threads
+ * @param min_iterations_per_thread minimum iterations that justifies its own thread; 0 is treated as 1
+ * @return size_t number of threads to use (at least 1)
+ */
+size_t calculate_num_threads_pow2(size_t num_iterations, size_t min_iterations_per_thread)
+{
+    size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
+    size_t min_iterations = min_iterations_per_thread > 0 ? min_iterations_per_thread : 1; // avoid division by zero
+    size_t quotient = num_iterations / min_iterations; // clamped to >= 1 below so get_msb never sees 0
+    size_t desired_num_threads = static_cast<size_t>(1ULL << numeric::get_msb(quotient > 0 ? quotient : 1));
+    size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
+    return num_threads > 0 ? num_threads : 1;                            // ensure at least 1 thread
+}
+
+} // namespace barretenberg::thread_utils
diff --git a/barretenberg/cpp/src/barretenberg/common/thread_utils.hpp b/barretenberg/cpp/src/barretenberg/common/thread_utils.hpp
@@ -0,0 +1,29 @@
+#pragma once
+#include "thread.hpp"
+
+namespace barretenberg::thread_utils {
+
+const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4; // default for `min_iterations_per_thread` below
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread
+ * @details Finds the number of cpus with get_num_cpus(), and calculates `desired_num_threads`
+ * Returns the min of `desired_num_threads` and `max_num_threads`.
+ * Note that it will not calculate a power of 2 necessarily, use `calculate_num_threads_pow2` instead
+ * @param num_iterations total number of iterations to distribute across threads
+ * @param min_iterations_per_thread minimum number of iterations that justifies its own thread
+ * @return size_t number of threads to use (at least 1)
+ */
+size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
+
+/**
+ * @brief calculates number of threads to create based on minimum iterations per thread, guaranteed power of 2
+ * @details Same functionality as `calculate_num_threads` but guaranteed power of 2
+ * @param num_iterations total number of iterations to distribute across threads
+ * @param min_iterations_per_thread minimum number of iterations that justifies its own thread
+ * @return size_t number of threads to use (at least 1)
+ */
+size_t calculate_num_threads_pow2(size_t num_iterations,
+                                  size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);
+
+} // namespace barretenberg::thread_utils
diff --git a/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/barretenberg/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "barretenberg/common/log.hpp"
 #include "barretenberg/common/thread.hpp"
+#include "barretenberg/common/thread_utils.hpp"
 #include "barretenberg/polynomials/barycentric.hpp"
 #include "barretenberg/polynomials/pow.hpp"
 #include "barretenberg/proof_system/flavor/flavor.hpp"
@@ -140,12 +141,10 @@ template <typename Flavor> class SumcheckProverRound {
         // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
         // on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
         // For now we use a power of 2 number of threads simply to ensure the round size is evenly divided.
-        size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
         size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
-        size_t desired_num_threads = round_size / min_iterations_per_thread;
-        size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
-        num_threads = num_threads > 0 ? num_threads : 1;                     // ensure num threads is >= 1
-        size_t iterations_per_thread = round_size / num_threads;             // actual iterations per thread
+        size_t num_threads =
+            barretenberg::thread_utils::calculate_num_threads_pow2(round_size, min_iterations_per_thread);
+        size_t iterations_per_thread = round_size / num_threads; // actual iterations per thread
 
         // Constuct univariate accumulator containers; one per thread
         std::vector<RelationUnivariates> thread_univariate_accumulators(num_threads);

diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.cpp
@@ -1,6 +1,8 @@
 #include "polynomial.hpp"
 #include "barretenberg/common/assert.hpp"
 #include "barretenberg/common/slab_allocator.hpp"
+#include "barretenberg/common/thread.hpp"
+#include "barretenberg/common/thread_utils.hpp"
 #include "polynomial_arithmetic.hpp"
 #include <cstddef>
 #include <fcntl.h>
@@ -306,25 +308,34 @@ template <typename Fr> void Polynomial<Fr>::add_scaled(std::span<const Fr> other
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] += scaling_factor * other[i];
-    }
+    // Calculates number of threads with thread_utils::calculate_num_threads
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] += scaling_factor * other[i];
+        }
+    });
 }
 
 template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator+=(std::span<const Fr> other)
 {
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] += other[i];
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] += other[i];
+        }
+    });
 
     return *this;
 }
@@ -334,23 +345,35 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(std::span<cons
     const size_t other_size = other.size();
     ASSERT(in_place_operation_viable(other_size));
 
-    /** TODO parallelize using some kind of generic evaluation domain
-     *  we really only need to know the thread size, but we don't need all the FFT roots
-     */
-    for (size_t i = 0; i < other_size; ++i) {
-        coefficients_.get()[i] -= other[i];
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(other_size);
+    size_t range_per_thread = other_size / num_threads;
+    size_t leftovers = other_size - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] -= other[i];
+        }
+    });
 
     return *this;
 }
 
-template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_facor)
+template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_factor)
 {
     ASSERT(in_place_operation_viable());
 
-    for (size_t i = 0; i < size_; ++i) {
-        coefficients_.get()[i] *= scaling_facor;
-    }
+    size_t num_threads = thread_utils::calculate_num_threads(size_);
+    size_t range_per_thread = size_ / num_threads;
+    size_t leftovers = size_ - (range_per_thread * num_threads);
+    parallel_for(num_threads, [&](size_t j) {
+        size_t offset = j * range_per_thread;
+        size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread;
+        for (size_t i = offset; i < end; ++i) {
+            coefficients_.get()[i] *= scaling_factor;
+        }
+    });
+
     return *this;
 }
 

diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial.hpp
@@ -188,7 +188,7 @@ template <typename Fr> class Polynomial {
      *
      * @param scaling_factor s
      */
-    Polynomial& operator*=(const Fr scaling_facor);
+    Polynomial& operator*=(const Fr scaling_factor);
 
     /**
      * @brief evaluates p(X) = ∑ᵢ aᵢ⋅Xⁱ considered as multi-linear extension p(X₀,…,Xₘ₋₁) = ∑ᵢ aᵢ⋅Lᵢ(X₀,…,Xₘ₋₁)