From 6952636a03825c97d671b1ac866c75461a09ba9d Mon Sep 17 00:00:00 2001
From: Innokentii Sennovskii
Date: Thu, 11 Jan 2024 17:41:28 +0000
Subject: [PATCH] feat: Parallel IPA (#3882)

This PR:
1. Adds a new function run_loop_in_parallel to thread.hpp, so that it is easier to use parallelism in most cases without redoing the computation for splitting the workload into chunks every time.
2. Uses this new functionality to parallelise the logic in the IPA opening and verification procedures (and the methods they use), a 20x-30x improvement for ECCVM proving.
3. Fixes a bug in the use of one of the vectors in the IPA opening procedure that added an extra O(n log n) term to the complexity.
4. Adds an IPA opening and verification benchmark.
---
 cpp/src/barretenberg/benchmark/CMakeLists.txt |   1 +
 .../benchmark/ipa_bench/CMakeLists.txt        |  18 ++
 .../benchmark/ipa_bench/ipa.bench.cpp         |  73 +++++
 .../commitment_schemes/ipa/ipa.hpp            | 143 +++++----
 cpp/src/barretenberg/common/thread.cpp        |  41 +++
 cpp/src/barretenberg/common/thread.hpp        |   3 +
 cpp/src/barretenberg/ecc/groups/element.hpp   |   2 +-
 .../barretenberg/ecc/groups/element_impl.hpp  | 282 ++++++++++++------
 8 files changed, 419 insertions(+), 144 deletions(-)
 create mode 100644 cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
 create mode 100644 cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp

diff --git a/cpp/src/barretenberg/benchmark/CMakeLists.txt b/cpp/src/barretenberg/benchmark/CMakeLists.txt
index 49bcb251f0..4a13dff546 100644
--- a/cpp/src/barretenberg/benchmark/CMakeLists.txt
+++ b/cpp/src/barretenberg/benchmark/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(ipa_bench)
 add_subdirectory(decrypt_bench)
 add_subdirectory(pippenger_bench)
 add_subdirectory(plonk_bench)

diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
new file mode 100644
index 0000000000..6e66ce2cf3
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Each source represents a separate benchmark suite
+set(BENCHMARK_SOURCES
+    ipa.bench.cpp
+)
+
+# Required libraries for benchmark suites
+set(LINKED_LIBRARIES
+    benchmark::benchmark
+    ultra_honk
+)
+
+# Add executable and custom target for each suite, e.g. ultra_honk_bench
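
A quick illustration of the run_loop_in_parallel API that point 1 above introduces (a sketch for orientation, not part of the diff; it relies only on the declaration that the thread.hpp hunk below adds, and square_all is a hypothetical example function):

```cpp
#include "barretenberg/common/thread.hpp"

#include <cstddef>
#include <vector>

// Square every element of `in` in parallel. run_loop_in_parallel splits
// [0, in.size()) into disjoint [start, end) chunks, so each chunk writes to
// its own slice of `out` and no synchronisation is needed.
std::vector<size_t> square_all(const std::vector<size_t>& in)
{
    std::vector<size_t> out(in.size());
    run_loop_in_parallel(
        in.size(),
        [&in, &out](size_t start, size_t end) {
            for (size_t i = start; i < end; i++) {
                out[i] = in[i] * in[i];
            }
        },
        /*no_multithreading_if_less_or_equal=*/16);
    return out;
}
```

The third argument keeps tiny workloads on the calling thread, since spawning work on the pool costs more than the loop itself for small sizes.
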
+foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
+    get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
+    add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
+    target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
+    add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endforeach()
\ No newline at end of file

diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
new file mode 100644
index 0000000000..9d4f7028e5
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
@@ -0,0 +1,73 @@
+#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
+#include <benchmark/benchmark.h>
+
+using namespace benchmark;
+using namespace barretenberg;
+using namespace proof_system;
+using namespace proof_system::honk::pcs::ipa;
+namespace {
+using Curve = curve::Grumpkin;
+using Fr = Curve::ScalarField;
+using IPA = IPA<Curve>;
+using OpeningPair = honk::pcs::OpeningPair<Curve>;
+using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
+using Polynomial = Polynomial<Fr>;
+using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
+using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;
+
+constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
+constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
+std::shared_ptr<barretenberg::srs::factories::CrsFactory<Curve>> crs_factory(
+    new barretenberg::srs::factories::FileCrsFactory<Curve>("../srs_db/grumpkin", 1 << 16));
+
+auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+
+std::vector<std::shared_ptr<honk::BaseTranscript<Fr>>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
+                                                                          MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+
+void ipa_open(State& state) noexcept
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t n = 1 << static_cast<size_t>(state.range(0));
+        // Construct the polynomial
+        Polynomial poly(n);
+        for (size_t i = 0; i < n; ++i) {
+            poly[i] = Fr::random_element(&engine);
+        }
+        auto x = Fr::random_element(&engine);
+        auto eval = poly.evaluate(x);
+        const OpeningPair opening_pair = { x, eval };
+        const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
+        // initialize empty prover transcript
+        auto prover_transcript = std::make_shared<honk::BaseTranscript<Fr>>();
+        state.ResumeTiming();
+        // Compute proof
+        IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
+        // Store info for verifier
+        prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript;
+        opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim;
+    }
+}
+void ipa_verify(State& state) noexcept
+{
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Retrieve proofs
+        auto prover_transcript = prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        auto opening_claim = opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        // initialize verifier transcript from proof data
+        auto verifier_transcript = std::make_shared<honk::BaseTranscript<Fr>>(prover_transcript->proof_data);
+
+        state.ResumeTiming();
+        auto result = IPA::verify(vk, opening_claim, verifier_transcript);
+        ASSERT(result);
+    }
+}
+} // namespace
+BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK_MAIN();
\ No newline at end of file
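
The first ipa.hpp hunk below parallelises the computation of b_vec, the vector of challenge powers. The serial version carried a single running b_power across iterations; the parallel version seeds each chunk independently with challenge.pow(start), which is what removes the cross-chunk dependency. A standalone sketch of that seeding pattern (illustrative, not from the patch: plain uint64_t arithmetic modulo a small prime stands in for Fr, and powers_of, pow_mod, and MODULUS are hypothetical names):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>

static constexpr uint64_t MODULUS = (1ULL << 31) - 1; // small prime stand-in for Fr's modulus

uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % MODULUS; } // operands < 2^31: no overflow

// Square-and-multiply; plays the role Fr::pow plays when seeding a chunk.
uint64_t pow_mod(uint64_t base, uint64_t exp)
{
    uint64_t result = 1;
    while (exp != 0) {
        if ((exp & 1) != 0) {
            result = mul_mod(result, base);
        }
        base = mul_mod(base, base);
        exp >>= 1;
    }
    return result;
}

// b_vec[i] = challenge^i, computed in independent chunks: each chunk seeds its
// own running power with challenge^start instead of inheriting it from the
// previous iteration.
std::vector<uint64_t> powers_of(uint64_t challenge, size_t n, size_t num_threads)
{
    if (num_threads == 0) {
        num_threads = 1;
    }
    std::vector<uint64_t> b_vec(n);
    const size_t chunk = (n + num_threads - 1) / num_threads; // ceiling division
    std::vector<std::thread> threads;
    for (size_t t = 0; t < num_threads; t++) {
        const size_t start = t * chunk;
        const size_t end = std::min(n, start + chunk);
        if (start >= end) {
            break;
        }
        threads.emplace_back([&b_vec, challenge, start, end] {
            uint64_t b_power = pow_mod(challenge, start); // O(log start) seed
            for (size_t i = start; i < end; i++) {
                b_vec[i] = b_power;
                b_power = mul_mod(b_power, challenge);
            }
        });
    }
    for (auto& th : threads) {
        th.join();
    }
    return b_vec;
}
```

Seeding costs one O(log start) exponentiation per chunk, negligible next to the O(chunk) work it unlocks.
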
diff --git a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
index 85cfb13e66..460f5d6dc4 100644
--- a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
+++ b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -52,46 +52,67 @@ template <typename Curve> class IPA {
         auto a_vec = polynomial;
         auto srs_elements = ck->srs->get_monomial_points();
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table, so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism.
         // G_vec_local should use only the original SRS, thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
+
         std::vector<Fr> b_vec(poly_degree);
-        Fr b_power = 1;
-        for (size_t i = 0; i < poly_degree; i++) {
-            b_vec[i] = b_power;
-            b_power *= opening_pair.challenge;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&b_vec, &opening_pair](size_t start, size_t end) {
+                Fr b_power = opening_pair.challenge.pow(start);
+                for (size_t i = start; i < end; i++) {
+                    b_vec[i] = b_power;
+                    b_power *= opening_pair.challenge;
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
+
         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
         std::vector<GroupElement> L_elements(log_poly_degree);
         std::vector<GroupElement> R_elements(log_poly_degree);
         std::size_t round_size = poly_degree;
-        // TODO(#479): restructure IPA so it can be integrated with the pthread alternative to the work queue (or
-        // even the work queue itself). Investigate whether parallelising parts of each IPA round brings significant
-        // improvements, and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of
-        // the loop can also be integrated.
+        // Perform IPA rounds
         for (size_t i = 0; i < log_poly_degree; i++) {
             round_size >>= 1;
             // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
+            std::mutex addition_lock;
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
-            for (size_t j = 0; j < round_size; j++) {
-                inner_prod_L += a_vec[j] * b_vec[round_size + j];
-                inner_prod_R += a_vec[round_size + j] * b_vec[j];
-            }
+            // Run the scalar products in parallel
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
+                    Fr current_inner_prod_L = Fr::zero();
+                    Fr current_inner_prod_R = Fr::zero();
+                    for (size_t j = start; j < end; j++) {
+                        current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
+                        current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
+                    }
+                    addition_lock.lock();
+                    inner_prod_L += current_inner_prod_L;
+                    inner_prod_R += current_inner_prod_R;
+                    addition_lock.unlock();
+                },
+                /*no_multithreading_if_less_or_equal=*/8);
+
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
-            L_elements[i] =
-                // TODO(#473)
-                barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
-                    &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
+            L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
+                &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
             L_elements[i] += aux_generator * inner_prod_L;

             // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
-            // TODO(#473)
             R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
                 &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
             R_elements[i] += aux_generator * inner_prod_R;
@@ -104,23 +125,32 @@ template <typename Curve> class IPA {
             const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
             const Fr round_challenge_inv = round_challenge.invert();

-            std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
-            std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
-            G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
-            G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
+            auto G_lo = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                round_challenge_inv);
+            auto G_hi = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
+                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                round_challenge);

             // Update the vectors a_vec, b_vec and G_vec.
             // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
             // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
             // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
-            for (size_t j = 0; j < round_size; j++) {
-                a_vec[j] *= round_challenge;
-                a_vec[j] += round_challenge_inv * a_vec[round_size + j];
-                b_vec[j] *= round_challenge_inv;
-                b_vec[j] += round_challenge * b_vec[round_size + j];
-
-                G_vec_local[j] = G_lo[j] + G_hi[j];
-            }
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
+                    size_t start, size_t end) {
+                    for (size_t j = start; j < end; j++) {
+                        a_vec[j] *= round_challenge;
+                        a_vec[j] += round_challenge_inv * a_vec[round_size + j];
+                        b_vec[j] *= round_challenge_inv;
+                        b_vec[j] += round_challenge * b_vec[round_size + j];
+
+                        G_vec_local[j] = G_lo[j] + G_hi[j];
+                    }
+                },
+                /*no_multithreading_if_less_or_equal=*/4);
         }

         transcript->send_to_verifier("IPA:a_0", a_vec[0]);
@@ -166,7 +196,7 @@ template <typename Curve> class IPA {
             msm_scalars[2 * i] = round_challenges[i].sqr();
             msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
         }
-        // TODO(#473)
+
         GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
             &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
         GroupElement C_zero = C_prime + LR_sums;
@@ -188,29 +218,42 @@ template <typename Curve> class IPA {
         // Compute G_zero
         // First construct s_vec
         std::vector<Fr> s_vec(poly_degree);
-        for (size_t i = 0; i < poly_degree; i++) {
-            Fr s_vec_scalar = Fr::one();
-            for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                auto bit = (i >> j) & 1;
-                bool b = static_cast<bool>(bit);
-                if (b) {
-                    s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
-                } else {
-                    s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                }
-            }
-            s_vec[i] = s_vec_scalar;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    Fr s_vec_scalar = Fr::one();
+                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+                        auto bit = (i >> j) & 1;
+                        bool b = static_cast<bool>(bit);
+                        if (b) {
+                            s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
+                        } else {
+                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
+                        }
+                    }
+                    s_vec[i] = s_vec_scalar;
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/4);
+
         auto srs_elements = vk->srs->get_monomial_points();
+
+        // Copy the G_vector to local memory.
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table, so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism.
         // G_vec_local should use only the original SRS, thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
-        // TODO(#473)
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
+
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);

diff --git a/cpp/src/barretenberg/common/thread.cpp b/cpp/src/barretenberg/common/thread.cpp
index 6d3007c873..232c30be69 100644
--- a/cpp/src/barretenberg/common/thread.cpp
+++ b/cpp/src/barretenberg/common/thread.cpp
@@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func)
 #endif
 #endif
 }
+
+/**
+ * @brief Split a loop into several loops running in parallel
+ *
+ * @details Splits num_points into an appropriate number of chunks for parallel processing and calls the function
+ * that contains the work loop on each chunk
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
+ * @param no_multithreading_if_less_or_equal If num_points is less than or equal to this value, run the loop in a
+ * single thread
+ */
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multithreading_if_less_or_equal)
+{
+    if (num_points <= no_multithreading_if_less_or_equal) {
+        func(0, num_points);
+        return;
+    }
+    // Get the number of cpus we can split into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk (ceiling division)
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ in case it's the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+}

diff --git a/cpp/src/barretenberg/common/thread.hpp b/cpp/src/barretenberg/common/thread.hpp
index 96e3df7409..fe799e1d46 100644
--- a/cpp/src/barretenberg/common/thread.hpp
+++ b/cpp/src/barretenberg/common/thread.hpp
@@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
 }

 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multithreading_if_less_or_equal = 0);
\ No newline at end of file
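
To make the chunk arithmetic above concrete, here is a minimal self-contained model of the same splitting strategy (a sketch, not the patch's code: std::thread stands in for the worker pool behind parallel_for, and run_loop_in_parallel_model is a hypothetical name):

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Same splitting rule as run_loop_in_parallel above: ceiling-divide num_points
// over the hardware threads, clamp the last chunk, and skip empty chunks.
void run_loop_in_parallel_model(size_t num_points,
                                const std::function<void(size_t, size_t)>& func,
                                size_t no_multithreading_if_less_or_equal = 0)
{
    if (num_points <= no_multithreading_if_less_or_equal) {
        func(0, num_points); // workload too small: run inline on this thread
        return;
    }
    const size_t num_cpus = std::max<size_t>(1, std::thread::hardware_concurrency());
    const size_t chunk_size = (num_points + num_cpus - 1) / num_cpus; // ceiling division
    std::vector<std::thread> threads;
    for (size_t chunk_index = 0; chunk_index < num_cpus; chunk_index++) {
        const size_t start = chunk_index * chunk_size;
        if (start >= num_points) {
            break; // fewer chunks than CPUs for small workloads
        }
        const size_t end = std::min(num_points, start + chunk_size);
        threads.emplace_back([&func, start, end] { func(start, end); });
    }
    for (auto& t : threads) {
        t.join();
    }
}
```

The real implementation delegates to parallel_for instead, so it reuses whatever threading backend barretenberg was built with rather than spawning fresh threads on every call.
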
"barretenberg/common/thread.hpp" #include "barretenberg/ecc/groups/element.hpp" #include "element.hpp" @@ -687,77 +688,130 @@ element element::mul_with_endomorphism(const Fr& exponent) return work_element; } +/** + * @brief Multiply each point by the same exponent + * + * @details We use the fact that all points are being multiplied by the same exponent to batch the operations (perform + * batch affine additions and doublings with batch inversion trick) + * + * @param points The span of individual points that need to be scaled + * @param exponent The scalar we multiply all the points by + * @return std::vector> Vector of new points where each point is exponentâ‹…points[i] + */ template std::vector> element::batch_mul_with_endomorphism( - const std::vector>& points, const Fr& exponent) noexcept + const std::span>& points, const Fr& exponent) noexcept { typedef affine_element affine_element; const size_t num_points = points.size(); + + // Space for temporary values std::vector scratch_space(num_points); // we can mutate rhs but NOT lhs! // output is stored in rhs - const auto batch_affine_add = [num_points, &scratch_space](const affine_element* lhs, affine_element* rhs) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < num_points; i += 1) { - scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 - rhs[i].x -= lhs[i].x; // x2 - x1 - rhs[i].y -= lhs[i].y; // y2 - y1 - rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (rhs[i].x); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - for (size_t i = (num_points)-1; i < num_points; i -= 1) { - rhs[i].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= rhs[i].x; - rhs[i].x = rhs[i].y.sqr(); - rhs[i].x = rhs[i].x - (scratch_space[i]); // x3 = lambda_squared - x2 - // - x1 - scratch_space[i] = lhs[i].x - rhs[i].x; - scratch_space[i] *= rhs[i].y; - rhs[i].y = scratch_space[i] - lhs[i].y; - } + /** + * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_add_chunked = + [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); + + for (size_t i = 0; i < point_count; i += 1) { + personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 + rhs[i].x -= lhs[i].x; // x2 - x1 + rhs[i].y -= lhs[i].y; // y2 - y1 + rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old + batch_inversion_accumulator *= (rhs[i].x); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + for (size_t i = (point_count)-1; i < point_count; i -= 1) { + rhs[i].y *= batch_inversion_accumulator; // update accumulator + batch_inversion_accumulator *= rhs[i].x; + rhs[i].x = rhs[i].y.sqr(); + rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 + // - x1 + personal_scratch_space[i] = lhs[i].x - rhs[i].x; + personal_scratch_space[i] *= rhs[i].y; + rhs[i].y = personal_scratch_space[i] - lhs[i].y; + } + }; + + /** + * @brief Perform batch affine addition in parallel + * + */ + const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, + affine_element* rhs) { + run_loop_in_parallel( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + 
+            /*no_multithreading_if_less_or_equal=*/4);
+    };

-    // double the elements in lhs
-    const auto batch_affine_double = [num_points, &scratch_space](affine_element* lhs) {
-        Fq batch_inversion_accumulator = Fq::one();
-
-        for (size_t i = 0; i < num_points; i += 1) {
-
-            scratch_space[i] = lhs[i].x.sqr();
-            scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i];
-
-            scratch_space[i] *= batch_inversion_accumulator;
-
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
-        }
-        batch_inversion_accumulator = batch_inversion_accumulator.invert();
-
-        Fq temp;
-        for (size_t i = (num_points)-1; i < num_points; i -= 1) {
-
-            scratch_space[i] *= batch_inversion_accumulator;
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
-
-            temp = lhs[i].x;
-            lhs[i].x = scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
-            lhs[i].y = scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
-        }
-    };
+    /**
+     * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion
+     */
+    const auto batch_affine_double_chunked =
+        [](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) {
+            Fq batch_inversion_accumulator = Fq::one();
+
+            for (size_t i = 0; i < point_count; i += 1) {
+                personal_scratch_space[i] = lhs[i].x.sqr();
+                personal_scratch_space[i] =
+                    personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i];
+
+                personal_scratch_space[i] *= batch_inversion_accumulator;
+
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+            }
+            batch_inversion_accumulator = batch_inversion_accumulator.invert();
+
+            Fq temp;
+            for (size_t i = (point_count)-1; i < point_count; i -= 1) {
+                personal_scratch_space[i] *= batch_inversion_accumulator;
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+
+                temp = lhs[i].x;
+                lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
+                lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
+            }
+        };
+
+    /**
+     * @brief Perform point doubling in parallel
+     */
+    const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) {
+        run_loop_in_parallel(
+            num_points,
+            [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) {
+                batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start);
+            },
+            /*no_multithreading_if_less_or_equal=*/4);
+    };

-    // Compute wnaf for scalar
     const Fr converted_scalar = exponent.from_montgomery_form();

+    // If the scalar is zero, just set the results to the point at infinity
     if (converted_scalar.is_zero()) {
         affine_element result{ Fq::zero(), Fq::zero() };
         result.self_set_infinity();
-        std::vector<affine_element> results;
-        for (size_t i = 0; i < num_points; ++i) {
-            results.emplace_back(result);
-        }
+        std::vector<affine_element> results(num_points);
+        run_loop_in_parallel(
+            num_points,
+            [&results, result](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    results[i] = result;
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
         return results;
     }

@@ -768,90 +822,132 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
     for (auto& table : lookup_table) {
         table.resize(num_points);
     }
+
+    // Initialize the first entries in the lookup table
     std::vector<affine_element> temp_point_vector(num_points);
-    for (size_t i = 0; i < num_points; ++i) {
-        temp_point_vector[i] = points[i];
-        lookup_table[0][i] = points[i];
-    }
+    run_loop_in_parallel(
+        num_points,
+        [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) {
+            for (size_t i = start; i < end; ++i) {
+                temp_point_vector[i] = points[i];
+                lookup_table[0][i] = points[i];
+            }
+        },
+        /*no_multithreading_if_less_or_equal=*/16);
+
+    // Construct the lookup table
     batch_affine_double(&temp_point_vector[0]);
     for (size_t j = 1; j < lookup_size; ++j) {
-
-        for (size_t i = 0; i < num_points; ++i) {
-            lookup_table[j][i] = lookup_table[j - 1][i];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [j, &lookup_table](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    lookup_table[j][i] = lookup_table[j - 1][i];
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]);
     }

     uint64_t wnaf_table[num_rounds * 2];
     Fr endo_scalar;
+
+    // Split the single scalar into two scalars in endomorphism form
     Fr::split_into_endomorphism_scalars(converted_scalar, endo_scalar, *(Fr*)&endo_scalar.data[2]); // NOLINT
     bool skew = false;
     bool endo_skew = false;
+    // Construct wnaf forms from the scalars
     wnaf::fixed_wnaf(&endo_scalar.data[0], &wnaf_table[0], skew, 0, 2, num_wnaf_bits);
     wnaf::fixed_wnaf(&endo_scalar.data[2], &wnaf_table[1], endo_skew, 0, 2, num_wnaf_bits);

     std::vector<affine_element> work_elements(num_points);
+
+    constexpr Fq beta = Fq::cube_root_of_unity();
     uint64_t wnaf_entry = 0;
     uint64_t index = 0;
     bool sign = 0;
-    Fq beta = Fq::cube_root_of_unity();
-
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < num_points; ++j) {
-            wnaf_entry = wnaf_table[i];
-            index = wnaf_entry & 0x0fffffffU;
-            sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-            const bool is_odd = ((i & 1) == 1);
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            if (i == 0) {
-                work_elements[j] = to_add;
-            } else {
-                temp_point_vector[j] = to_add;
-            }
-        }
-    }
+    // Prepare the elements for the first batch addition
+    for (size_t j = 0; j < 2; ++j) {
+        wnaf_entry = wnaf_table[j];
+        index = wnaf_entry & 0x0fffffffU;
+        sign = static_cast<bool>((wnaf_entry >> 31) & 1);
+        const bool is_odd = ((j & 1) == 1);
+        run_loop_in_parallel(
+            num_points,
+            [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start,
+                                                                                              size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    if (j == 0) {
+                        work_elements[i] = to_add;
+                    } else {
+                        temp_point_vector[i] = to_add;
+                    }
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
+    }
+    // First cycle of addition
     batch_affine_add(&temp_point_vector[0], &work_elements[0]);
-
-    for (size_t i = 2; i < num_rounds * 2; ++i) {
-        wnaf_entry = wnaf_table[i];
+    // Run through the scalar multiplication logic in wnaf form (excluding the skew)
+    for (size_t j = 2; j < num_rounds * 2; ++j) {
+        wnaf_entry = wnaf_table[j];
         index = wnaf_entry & 0x0fffffffU;
         sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-        const bool is_odd = ((i & 1) == 1);
+        const bool is_odd = ((j & 1) == 1);
         if (!is_odd) {
             for (size_t k = 0; k < 4; ++k) {
                 batch_affine_double(&work_elements[0]);
            }
        }
-        for (size_t j = 0; j < num_points; ++j) {
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            temp_point_vector[j] = to_add;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    temp_point_vector[i] = to_add;
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
        batch_affine_add(&temp_point_vector[0], &work_elements[0]);
    }

+    // Apply the skew for the first endo scalar
    if (skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = -lookup_table[0][j];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [&lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    temp_point_vector[i] = -lookup_table[0][i];
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
        batch_affine_add(&temp_point_vector[0], &work_elements[0]);
    }
-
+    // Apply the skew for the second endo scalar
    if (endo_skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = lookup_table[0][j];
-            temp_point_vector[j].x *= beta;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    temp_point_vector[i] = lookup_table[0][i];
+                    temp_point_vector[i].x *= beta;
+                }
+            },
+            /*no_multithreading_if_less_or_equal=*/16);
        batch_affine_add(&temp_point_vector[0], &work_elements[0]);
    }
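
The batch_affine_add / batch_affine_double kernels in the hunk above amortise field inversions with Montgomery's batch inversion trick: one inversion plus O(n) multiplications replaces n inversions. A self-contained sketch of the trick itself (illustrative: uint64_t arithmetic modulo a small prime stands in for Fq, and batch_invert, inv_mod, and P are hypothetical names):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

static constexpr uint64_t P = 2147483647ULL; // 2^31 - 1, a small prime stand-in for Fq's modulus

uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % P; } // operands < 2^31: no overflow

// Fermat inversion: x^(P-2) mod P, the single expensive operation per batch.
uint64_t inv_mod(uint64_t x)
{
    uint64_t result = 1;
    uint64_t exp = P - 2;
    while (exp != 0) {
        if ((exp & 1) != 0) {
            result = mul_mod(result, x);
        }
        x = mul_mod(x, x);
        exp >>= 1;
    }
    return result;
}

// Invert every element of xs with a single inv_mod call (Montgomery's trick),
// mirroring how batch_affine_add inverts all the (x2 - x1) denominators at once.
std::vector<uint64_t> batch_invert(const std::vector<uint64_t>& xs)
{
    const size_t n = xs.size();
    std::vector<uint64_t> prefix(n); // prefix[i] = xs[0] * ... * xs[i]
    uint64_t accumulator = 1;
    for (size_t i = 0; i < n; i++) {
        assert(xs[i] != 0); // zero denominators must be filtered out beforehand
        accumulator = mul_mod(accumulator, xs[i]);
        prefix[i] = accumulator;
    }
    accumulator = inv_mod(accumulator); // invert the product of all elements once
    std::vector<uint64_t> out(n);
    for (size_t i = n - 1; i < n; i--) { // same reverse-loop idiom as the kernels above
        // accumulator currently holds inv(xs[0] * ... * xs[i])
        out[i] = (i == 0) ? accumulator : mul_mod(accumulator, prefix[i - 1]);
        accumulator = mul_mod(accumulator, xs[i]); // peel xs[i] off the running inverse
    }
    return out;
}
```

In the kernels above the trick is fused directly into the addition and doubling formulas rather than materialising the inverses, and each parallel chunk runs its own accumulator over its own personal_scratch_space slice, which is why the chunked variants take a scratch pointer offset by `start`.
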