diff --git a/cpp/src/barretenberg/benchmark/CMakeLists.txt b/cpp/src/barretenberg/benchmark/CMakeLists.txt
index 49bcb251f0..4a13dff546 100644
--- a/cpp/src/barretenberg/benchmark/CMakeLists.txt
+++ b/cpp/src/barretenberg/benchmark/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(ipa_bench)
 add_subdirectory(decrypt_bench)
 add_subdirectory(pippenger_bench)
 add_subdirectory(plonk_bench)
diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
new file mode 100644
index 0000000000..6e66ce2cf3
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Each source represents a separate benchmark suite
+set(BENCHMARK_SOURCES
+    ipa.bench.cpp
+)
+
+# Required libraries for benchmark suites
+set(LINKED_LIBRARIES
+    benchmark::benchmark
+    ultra_honk
+)
+
+# Add executable and custom target for each suite, e.g. ipa_bench
+foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
+    get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
+    add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
+    target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
+    add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endforeach()
\ No newline at end of file
diff --git a/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
new file mode 100644
index 0000000000..9d4f7028e5
--- /dev/null
+++ b/cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
@@ -0,0 +1,73 @@
+#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
+#include <benchmark/benchmark.h>
+
+using namespace benchmark;
+using namespace barretenberg;
+using namespace proof_system;
+using namespace proof_system::honk::pcs::ipa;
+namespace {
+using Curve = curve::Grumpkin;
+using Fr = Curve::ScalarField;
+using IPA = IPA<Curve>;
+using OpeningPair = honk::pcs::OpeningPair<Curve>;
+using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
+using Polynomial = Polynomial<Fr>;
+using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
+using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;
+
+constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
+constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
+std::shared_ptr<barretenberg::srs::factories::CrsFactory<Curve>> crs_factory(
+    new barretenberg::srs::factories::FileCrsFactory<Curve>("../srs_db/grumpkin", 1 << 16));
+
+auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
+
+std::vector<std::shared_ptr<honk::BaseTranscript<Fr>>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
+                                                                          MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
+
+void ipa_open(State& state) noexcept
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t n = 1 << static_cast<size_t>(state.range(0));
+        // Construct the polynomial
+        Polynomial poly(n);
+        for (size_t i = 0; i < n; ++i) {
+            poly[i] = Fr::random_element(&engine);
+        }
+        auto x = Fr::random_element(&engine);
+        auto eval = poly.evaluate(x);
+        const OpeningPair opening_pair = { x, eval };
+        const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
+        // initialize empty prover transcript
+        auto prover_transcript = std::make_shared<honk::BaseTranscript<Fr>>();
+        state.ResumeTiming();
+        // Compute proof
+        IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
+        // Store info for verifier
+        prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript;
+        opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim;
+    }
+}
+void ipa_verify(State& state) noexcept
+{
+    for (auto _ : state) {
+        state.PauseTiming();
+        // Retrieve proofs
+        auto prover_transcript = prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        auto opening_claim = opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
+        // initialize verifier transcript from proof data
+        auto verifier_transcript = std::make_shared<honk::BaseTranscript<Fr>>(prover_transcript->proof_data);
+
+        state.ResumeTiming();
+        auto result = IPA::verify(vk, opening_claim, verifier_transcript);
+        ASSERT(result);
+    }
+}
+} // namespace
+BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
+BENCHMARK_MAIN();
\ No newline at end of file
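The suite leans on two Google Benchmark idioms: `DenseRange(MIN, MAX)` runs each benchmark once per log-size, delivered through `state.range(0)`, while `PauseTiming()`/`ResumeTiming()` keep polynomial construction and commitment out of the measured region. A minimal self-contained sketch of the same pattern (the `expensive_op` workload is a placeholder, not part of this PR):

```cpp
#include <benchmark/benchmark.h>
#include <numeric>
#include <vector>

// Placeholder workload standing in for the proving call being measured.
static size_t expensive_op(const std::vector<size_t>& v)
{
    return std::accumulate(v.begin(), v.end(), size_t(0));
}

static void sketch_bench(benchmark::State& state)
{
    for (auto _ : state) {
        state.PauseTiming(); // exclude input construction, as ipa_open does
        std::vector<size_t> input(size_t(1) << static_cast<size_t>(state.range(0)), 1);
        state.ResumeTiming();
        benchmark::DoNotOptimize(expensive_op(input));
    }
}
// One run per exponent 10..16, mirroring the DenseRange over log polynomial degree.
BENCHMARK(sketch_bench)->Unit(benchmark::kMillisecond)->DenseRange(10, 16);
BENCHMARK_MAIN();
```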
diff --git a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
index 85cfb13e66..460f5d6dc4 100644
--- a/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
+++ b/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -52,46 +52,67 @@ template <typename Curve> class IPA {
         auto a_vec = polynomial;
         auto srs_elements = ck->srs->get_monomial_points();
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         std::vector<Fr> b_vec(poly_degree);
-        Fr b_power = 1;
-        for (size_t i = 0; i < poly_degree; i++) {
-            b_vec[i] = b_power;
-            b_power *= opening_pair.challenge;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&b_vec, &opening_pair](size_t start, size_t end) {
+                Fr b_power = opening_pair.challenge.pow(start);
+                for (size_t i = start; i < end; i++) {
+                    b_vec[i] = b_power;
+                    b_power *= opening_pair.challenge;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
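Note how the `b_vec` loop, whose iterations form a running power and look inherently sequential, gets parallelised: each chunk re-seeds its accumulator with `challenge.pow(start)`, making chunks independent at the cost of one `pow` per chunk. A toy sketch of the pattern, with arithmetic mod a small prime standing in for `Fr` and a hard-coded two-chunk runner standing in for `run_loop_in_parallel`:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Toy stand-ins: arithmetic mod a small prime replaces Fr.
constexpr uint64_t P = 1000000007ULL;
uint64_t mul(uint64_t a, uint64_t b) { return (a * b) % P; }
uint64_t pow_mod(uint64_t base, uint64_t e)
{
    uint64_t r = 1;
    for (; e != 0; e >>= 1, base = mul(base, base)) {
        if ((e & 1) != 0) {
            r = mul(r, base);
        }
    }
    return r;
}

void run_chunked(size_t n, const std::function<void(size_t, size_t)>& func)
{
    func(0, n / 2); // pretend these two calls
    func(n / 2, n); // execute on different threads
}

int main()
{
    const uint64_t challenge = 5;
    std::vector<uint64_t> b_vec(1 << 10);
    run_chunked(b_vec.size(), [&](size_t start, size_t end) {
        // Re-seeding with challenge^start is what decouples the chunks.
        uint64_t b_power = pow_mod(challenge, start);
        for (size_t i = start; i < end; i++) {
            b_vec[i] = b_power;
            b_power = mul(b_power, challenge);
        }
    });
    return 0;
}
```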
         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
         std::vector<GroupElement> L_elements(log_poly_degree);
         std::vector<GroupElement> R_elements(log_poly_degree);
         std::size_t round_size = poly_degree;
 
-        // TODO(#479): restructure IPA so it can be integrated with the pthread alternative to work queue (or even the
-        // work queue itself). Investigate whether parallelising parts of each rounds of IPA rounds brings significant
-        // improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
-        // loop can also be integrated.
+        // Perform IPA rounds
         for (size_t i = 0; i < log_poly_degree; i++) {
             round_size >>= 1;
             // Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
+            std::mutex addition_lock;
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
-            for (size_t j = 0; j < round_size; j++) {
-                inner_prod_L += a_vec[j] * b_vec[round_size + j];
-                inner_prod_R += a_vec[round_size + j] * b_vec[j];
-            }
+            // Run scalar product in parallel
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
+                    Fr current_inner_prod_L = Fr::zero();
+                    Fr current_inner_prod_R = Fr::zero();
+                    for (size_t j = start; j < end; j++) {
+                        current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
+                        current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
+                    }
+                    addition_lock.lock();
+                    inner_prod_L += current_inner_prod_L;
+                    inner_prod_R += current_inner_prod_R;
+                    addition_lock.unlock();
+                },
+                /*no_multhreading_if_less_or_equal=*/8);
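The inner products are a classic parallel reduction: each chunk accumulates into thread-local sums and takes the mutex once per chunk rather than once per iteration. A self-contained sketch of that shape, with `std::thread` standing in for the thread pool and `uint64_t` for `Fr`:

```cpp
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

int main()
{
    std::vector<uint64_t> a(1 << 16, 3), b(1 << 16, 7);
    uint64_t inner_prod = 0;
    std::mutex addition_lock;

    auto worker = [&](size_t start, size_t end) {
        uint64_t partial = 0; // thread-local accumulator: no contention in the hot loop
        for (size_t j = start; j < end; j++) {
            partial += a[j] * b[j];
        }
        std::lock_guard<std::mutex> guard(addition_lock); // one lock per chunk
        inner_prod += partial;
    };

    std::thread t1(worker, 0, a.size() / 2);
    std::thread t2(worker, a.size() / 2, a.size());
    t1.join();
    t2.join();
    return 0;
}
```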
+
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
-            L_elements[i] =
-                // TODO(#473)
-                barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
-                    &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
+            L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
+                &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
             L_elements[i] += aux_generator * inner_prod_L;
 
             // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
-            // TODO(#473)
             R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
                 &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
             R_elements[i] += aux_generator * inner_prod_R;
@@ -104,23 +125,32 @@ template <typename Curve> class IPA {
             const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
             const Fr round_challenge_inv = round_challenge.invert();
 
-            std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
-            std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
-            G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
-            G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
+            auto G_lo = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                round_challenge_inv);
+            auto G_hi = GroupElement::batch_mul_with_endomorphism(
+                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
+                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                round_challenge);
 
             // Update the vectors a_vec, b_vec and G_vec.
             // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
             // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
             // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
-            for (size_t j = 0; j < round_size; j++) {
-                a_vec[j] *= round_challenge;
-                a_vec[j] += round_challenge_inv * a_vec[round_size + j];
-                b_vec[j] *= round_challenge_inv;
-                b_vec[j] += round_challenge * b_vec[round_size + j];
-
-                G_vec_local[j] = G_lo[j] + G_hi[j];
-            }
+            run_loop_in_parallel(
+                round_size,
+                [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
+                    size_t start, size_t end) {
+                    for (size_t j = start; j < end; j++) {
+                        a_vec[j] *= round_challenge;
+                        a_vec[j] += round_challenge_inv * a_vec[round_size + j];
+                        b_vec[j] *= round_challenge_inv;
+                        b_vec[j] += round_challenge * b_vec[round_size + j];
+
+                        G_vec_local[j] = G_lo[j] + G_hi[j];
+                    }
+                },
+                /*no_multhreading_if_less_or_equal=*/4);
         }
 
         transcript->send_to_verifier("IPA:a_0", a_vec[0]);
@@ -166,7 +196,7 @@ template <typename Curve> class IPA {
             msm_scalars[2 * i] = round_challenges[i].sqr();
             msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
         }
-        // TODO(#473)
+
         GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
         GroupElement C_zero = C_prime + LR_sums;
@@ -188,29 +218,42 @@ template <typename Curve> class IPA {
         // Compute G_zero
         // First construct s_vec
         std::vector<Fr> s_vec(poly_degree);
-        for (size_t i = 0; i < poly_degree; i++) {
-            Fr s_vec_scalar = Fr::one();
-            for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                auto bit = (i >> j) & 1;
-                bool b = static_cast<bool>(bit);
-                if (b) {
-                    s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
-                } else {
-                    s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                }
-            }
-            s_vec[i] = s_vec_scalar;
-        }
+        run_loop_in_parallel(
+            poly_degree,
+            [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    Fr s_vec_scalar = Fr::one();
+                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+                        auto bit = (i >> j) & 1;
+                        bool b = static_cast<bool>(bit);
+                        if (b) {
+                            s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
+                        } else {
+                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
+                        }
+                    }
+                    s_vec[i] = s_vec_scalar;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/4);
+
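Each `s_vec[i]` is a product of the round challenges or their inverses, selected by the bits of `i` from most significant down. A toy walkthrough with `double` standing in for `Fr` (so an inverse is just `1/x`) and made-up challenge values:

```cpp
#include <cstddef>
#include <vector>

int main()
{
    const size_t log_n = 3;
    std::vector<double> challenge = { 2.0, 3.0, 5.0 };
    std::vector<double> challenge_inv = { 0.5, 1.0 / 3.0, 0.2 };

    std::vector<double> s_vec(size_t(1) << log_n);
    for (size_t i = 0; i < s_vec.size(); i++) {
        double s = 1.0;
        // Bit j of i (MSB first) selects the challenge of round log_n-1-j
        // when set, its inverse when clear; unsigned wrap ends the loop.
        for (size_t j = log_n - 1; j != size_t(-1); j--) {
            bool b = ((i >> j) & 1) != 0;
            s *= b ? challenge[log_n - 1 - j] : challenge_inv[log_n - 1 - j];
        }
        s_vec[i] = s;
    }
    // e.g. s_vec[0] = (1/2)(1/3)(1/5) and s_vec[7] = 2 * 3 * 5
    return 0;
}
```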
         auto srs_elements = vk->srs->get_monomial_points();
+
         // Copy the G_vector to local memory.
         std::vector<Commitment> G_vec_local(poly_degree);
+
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        for (size_t i = 0; i < poly_degree * 2; i += 2) {
-            G_vec_local[i >> 1] = srs_elements[i];
-        }
-        // TODO(#473)
+        run_loop_in_parallel(
+            poly_degree,
+            [&G_vec_local, srs_elements](size_t start, size_t end) {
+                for (size_t i = start * 2; i < end * 2; i += 2) {
+                    G_vec_local[i >> 1] = srs_elements[i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
diff --git a/cpp/src/barretenberg/common/thread.cpp b/cpp/src/barretenberg/common/thread.cpp
index 6d3007c873..232c30be69 100644
--- a/cpp/src/barretenberg/common/thread.cpp
+++ b/cpp/src/barretenberg/common/thread.cpp
@@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
 #endif
 #endif
 }
+
+/**
+ * @brief Split a loop into several loops running in parallel
+ *
+ * @details Splits num_points into an appropriate number of chunks to process in parallel and calls the function
+ * that should contain the work loop
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){...}}
+ * @param no_multhreading_if_less_or_equal If num_points is less than or equal to this value, run sequentially
+ */
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal)
+{
+    if (num_points <= no_multhreading_if_less_or_equal) {
+        func(0, num_points);
+        return;
+    }
+    // Get number of cpus we can split into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ in case it's the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+};
diff --git a/cpp/src/barretenberg/common/thread.hpp b/cpp/src/barretenberg/common/thread.hpp
index 96e3df7409..fe799e1d46 100644
--- a/cpp/src/barretenberg/common/thread.hpp
+++ b/cpp/src/barretenberg/common/thread.hpp
@@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
 }
 
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
+void run_loop_in_parallel(size_t num_points,
+                          const std::function<void(size_t, size_t)>& func,
+                          size_t no_multhreading_if_less_or_equal = 0);
\ No newline at end of file
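Two details of `run_loop_in_parallel` are worth noting: the threshold short-circuits to a plain sequential call for small inputs, and the ceil-division chunk size means trailing workers can receive a short or empty range, hence the two guards. A self-contained re-implementation of the same arithmetic, offered as a sketch only (`std::thread` stands in for the real `parallel_for` dispatch, and `sequential_threshold` is my own name for the threshold parameter):

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

void run_chunked(size_t num_points, const std::function<void(size_t, size_t)>& func, size_t sequential_threshold)
{
    if (num_points <= sequential_threshold) {
        func(0, num_points); // small input: skip thread overhead entirely
        return;
    }
    const size_t num_cpus = std::max<size_t>(std::thread::hardware_concurrency(), 1);
    // Ceil division: every chunk but possibly the last has this size
    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
    std::vector<std::thread> threads;
    for (size_t chunk_index = 0; chunk_index < num_cpus; chunk_index++) {
        size_t start = chunk_index * chunk_size;
        if (start >= num_points) {
            break; // rounding up can leave trailing CPUs with no work
        }
        size_t end = std::min(start + chunk_size, num_points);
        threads.emplace_back(func, start, end);
    }
    for (auto& t : threads) {
        t.join();
    }
}

int main()
{
    std::vector<int> v(100);
    run_chunked(
        v.size(),
        [&](size_t start, size_t end) {
            for (size_t i = start; i < end; i++) {
                v[i] = 1;
            }
        },
        /*sequential_threshold=*/16);
    return 0;
}
```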
"barretenberg/common/thread.hpp" #include "barretenberg/ecc/groups/element.hpp" #include "element.hpp" @@ -687,77 +688,130 @@ element element::mul_with_endomorphism(const Fr& exponent) return work_element; } +/** + * @brief Multiply each point by the same exponent + * + * @details We use the fact that all points are being multiplied by the same exponent to batch the operations (perform + * batch affine additions and doublings with batch inversion trick) + * + * @param points The span of individual points that need to be scaled + * @param exponent The scalar we multiply all the points by + * @return std::vector> Vector of new points where each point is exponentâ‹…points[i] + */ template std::vector> element::batch_mul_with_endomorphism( - const std::vector>& points, const Fr& exponent) noexcept + const std::span>& points, const Fr& exponent) noexcept { typedef affine_element affine_element; const size_t num_points = points.size(); + + // Space for temporary values std::vector scratch_space(num_points); // we can mutate rhs but NOT lhs! // output is stored in rhs - const auto batch_affine_add = [num_points, &scratch_space](const affine_element* lhs, affine_element* rhs) { - Fq batch_inversion_accumulator = Fq::one(); - - for (size_t i = 0; i < num_points; i += 1) { - scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 - rhs[i].x -= lhs[i].x; // x2 - x1 - rhs[i].y -= lhs[i].y; // y2 - y1 - rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (rhs[i].x); - } - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - - for (size_t i = (num_points)-1; i < num_points; i -= 1) { - rhs[i].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= rhs[i].x; - rhs[i].x = rhs[i].y.sqr(); - rhs[i].x = rhs[i].x - (scratch_space[i]); // x3 = lambda_squared - x2 - // - x1 - scratch_space[i] = lhs[i].x - rhs[i].x; - scratch_space[i] *= rhs[i].y; - rhs[i].y = scratch_space[i] - lhs[i].y; - } + /** + * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion + * + */ + const auto batch_affine_add_chunked = + [](const affine_element* lhs, affine_element* rhs, const size_t point_count, Fq* personal_scratch_space) { + Fq batch_inversion_accumulator = Fq::one(); + + for (size_t i = 0; i < point_count; i += 1) { + personal_scratch_space[i] = lhs[i].x + rhs[i].x; // x2 + x1 + rhs[i].x -= lhs[i].x; // x2 - x1 + rhs[i].y -= lhs[i].y; // y2 - y1 + rhs[i].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old + batch_inversion_accumulator *= (rhs[i].x); + } + batch_inversion_accumulator = batch_inversion_accumulator.invert(); + + for (size_t i = (point_count)-1; i < point_count; i -= 1) { + rhs[i].y *= batch_inversion_accumulator; // update accumulator + batch_inversion_accumulator *= rhs[i].x; + rhs[i].x = rhs[i].y.sqr(); + rhs[i].x = rhs[i].x - (personal_scratch_space[i]); // x3 = lambda_squared - x2 + // - x1 + personal_scratch_space[i] = lhs[i].x - rhs[i].x; + personal_scratch_space[i] *= rhs[i].y; + rhs[i].y = personal_scratch_space[i] - lhs[i].y; + } + }; + + /** + * @brief Perform batch affine addition in parallel + * + */ + const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, + affine_element* rhs) { + run_loop_in_parallel( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + 
-    // double the elements in lhs
-    const auto batch_affine_double = [num_points, &scratch_space](affine_element* lhs) {
-        Fq batch_inversion_accumulator = Fq::one();
+    /**
+     * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion
+     *
+     */
+    const auto batch_affine_double_chunked =
+        [](affine_element* lhs, const size_t point_count, Fq* personal_scratch_space) {
+            Fq batch_inversion_accumulator = Fq::one();
 
-        for (size_t i = 0; i < num_points; i += 1) {
+            for (size_t i = 0; i < point_count; i += 1) {
 
-            scratch_space[i] = lhs[i].x.sqr();
-            scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i];
+                personal_scratch_space[i] = lhs[i].x.sqr();
+                personal_scratch_space[i] =
+                    personal_scratch_space[i] + personal_scratch_space[i] + personal_scratch_space[i];
 
-            scratch_space[i] *= batch_inversion_accumulator;
+                personal_scratch_space[i] *= batch_inversion_accumulator;
 
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
-        }
-        batch_inversion_accumulator = batch_inversion_accumulator.invert();
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+            }
+            batch_inversion_accumulator = batch_inversion_accumulator.invert();
 
-        Fq temp;
-        for (size_t i = (num_points)-1; i < num_points; i -= 1) {
+            Fq temp;
+            for (size_t i = (point_count)-1; i < point_count; i -= 1) {
 
-            scratch_space[i] *= batch_inversion_accumulator;
-            batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
+                personal_scratch_space[i] *= batch_inversion_accumulator;
+                batch_inversion_accumulator *= (lhs[i].y + lhs[i].y);
 
-            temp = lhs[i].x;
-            lhs[i].x = scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
-            lhs[i].y = scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
-        }
+                temp = lhs[i].x;
+                lhs[i].x = personal_scratch_space[i].sqr() - (lhs[i].x + lhs[i].x);
+                lhs[i].y = personal_scratch_space[i] * (temp - lhs[i].x) - lhs[i].y;
+            }
+        };
+    /**
+     * @brief Perform point doubling in parallel
+     *
+     */
+    const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) {
+        run_loop_in_parallel(
+            num_points,
+            [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) {
+                batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start);
+            },
+            /*no_multhreading_if_less_or_equal=*/4);
     };
-    // Compute wnaf for scalar
     const Fr converted_scalar = exponent.from_montgomery_form();
 
+    // If the scalar is zero, just set results to the point at infinity
     if (converted_scalar.is_zero()) {
         affine_element result{ Fq::zero(), Fq::zero() };
         result.self_set_infinity();
-        std::vector<affine_element> results;
-        for (size_t i = 0; i < num_points; ++i) {
-            results.emplace_back(result);
-        }
+        std::vector<affine_element> results(num_points);
+        run_loop_in_parallel(
+            num_points,
+            [&results, result](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    results[i] = result;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         return results;
     }
@@ -768,90 +822,132 @@ std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomo
     for (auto& table : lookup_table) {
         table.resize(num_points);
    }
+
+    // Initialize first entries in lookup table
     std::vector<affine_element> temp_point_vector(num_points);
-    for (size_t i = 0; i < num_points; ++i) {
-        temp_point_vector[i] = points[i];
-        lookup_table[0][i] = points[i];
-    }
+    run_loop_in_parallel(
+        num_points,
+        [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) {
+            for (size_t i = start; i < end; ++i) {
+                temp_point_vector[i] = points[i];
+                lookup_table[0][i] = points[i];
+            }
+        },
+        /*no_multhreading_if_less_or_equal=*/16);
+
+    // Construct lookup table
     batch_affine_double(&temp_point_vector[0]);
     for (size_t j = 1; j < lookup_size; ++j) {
-
-        for (size_t i = 0; i < num_points; ++i) {
-            lookup_table[j][i] = lookup_table[j - 1][i];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [j, &lookup_table](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    lookup_table[j][i] = lookup_table[j - 1][i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
        batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]);
     }
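`lookup_table[j]` holds the odd multiples (2j+1)·P of every input point: row 0 is P itself, `temp_point_vector` is doubled once to hold 2P, and each row is the previous row plus 2P. Integer stand-ins make the structure easy to check (a `lookup_size` of 8 is assumed here purely for illustration; the real value is set by the wnaf window):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    // Integers stand in for curve points: "addition" is +, "doubling" is *2.
    const size_t lookup_size = 8; // assumed: rows hold P, 3P, 5P, ..., 15P
    std::vector<long> points = { 10, 20, 30 };
    const size_t num_points = points.size();

    std::vector<std::vector<long>> lookup_table(lookup_size, std::vector<long>(num_points));
    std::vector<long> temp(num_points);
    for (size_t i = 0; i < num_points; ++i) {
        lookup_table[0][i] = points[i]; // row 0: 1*P
        temp[i] = 2 * points[i];        // the single batch_affine_double
    }
    for (size_t j = 1; j < lookup_size; ++j) {
        for (size_t i = 0; i < num_points; ++i) {
            // row j = row j-1 + 2P, i.e. (2j+1)*P
            lookup_table[j][i] = lookup_table[j - 1][i] + temp[i];
        }
    }
    assert(lookup_table[7][0] == 15 * 10);
    return 0;
}
```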
     uint64_t wnaf_table[num_rounds * 2];
     Fr endo_scalar;
+
+    // Split single scalar into 2 scalars in endo form
     Fr::split_into_endomorphism_scalars(converted_scalar, endo_scalar, *(Fr*)&endo_scalar.data[2]); // NOLINT
 
     bool skew = false;
     bool endo_skew = false;
+    // Construct wnaf forms from scalars
     wnaf::fixed_wnaf(&endo_scalar.data[0], &wnaf_table[0], skew, 0, 2, num_wnaf_bits);
     wnaf::fixed_wnaf(&endo_scalar.data[2], &wnaf_table[1], endo_skew, 0, 2, num_wnaf_bits);
 
     std::vector<affine_element> work_elements(num_points);
+
+    constexpr Fq beta = Fq::cube_root_of_unity();
     uint64_t wnaf_entry = 0;
     uint64_t index = 0;
     bool sign = 0;
-    Fq beta = Fq::cube_root_of_unity();
-
-    for (size_t i = 0; i < 2; ++i) {
-        for (size_t j = 0; j < num_points; ++j) {
-            wnaf_entry = wnaf_table[i];
-            index = wnaf_entry & 0x0fffffffU;
-            sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-            const bool is_odd = ((i & 1) == 1);
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            if (i == 0) {
-                work_elements[j] = to_add;
-            } else {
-                temp_point_vector[j] = to_add;
-            }
-        }
-    }
+    // Prepare elements for the first batch addition
+    for (size_t j = 0; j < 2; ++j) {
+        wnaf_entry = wnaf_table[j];
+        index = wnaf_entry & 0x0fffffffU;
+        sign = static_cast<bool>((wnaf_entry >> 31) & 1);
+        const bool is_odd = ((j & 1) == 1);
+        run_loop_in_parallel(
+            num_points,
+            [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start,
+                                                                                              size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    if (j == 0) {
+                        work_elements[i] = to_add;
+                    } else {
+                        temp_point_vector[i] = to_add;
+                    }
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
+    }
+    // First cycle of addition
     batch_affine_add(&temp_point_vector[0], &work_elements[0]);
-
-    for (size_t i = 2; i < num_rounds * 2; ++i) {
-        wnaf_entry = wnaf_table[i];
+    // Run through SM logic in wnaf form (excluding the skew)
+    for (size_t j = 2; j < num_rounds * 2; ++j) {
+        wnaf_entry = wnaf_table[j];
         index = wnaf_entry & 0x0fffffffU;
         sign = static_cast<bool>((wnaf_entry >> 31) & 1);
-        const bool is_odd = ((i & 1) == 1);
+        const bool is_odd = ((j & 1) == 1);
         if (!is_odd) {
             for (size_t k = 0; k < 4; ++k) {
                 batch_affine_double(&work_elements[0]);
             }
         }
-        for (size_t j = 0; j < num_points; ++j) {
-            auto to_add = lookup_table[static_cast<size_t>(index)][j];
-            to_add.y.self_conditional_negate(sign ^ is_odd);
-            if (is_odd) {
-                to_add.x *= beta;
-            }
-            temp_point_vector[j] = to_add;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                    to_add.y.self_conditional_negate(sign ^ is_odd);
+                    if (is_odd) {
+                        to_add.x *= beta;
+                    }
+                    temp_point_vector[i] = to_add;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }
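In the main loop, even wnaf entries belong to the base scalar and trigger four doublings (one window), while odd entries reuse the already-doubled accumulator for the endomorphism scalar, with `x` scaled by β. The four doublings amount to Horner's rule in base 2^4, as a tiny integer sketch shows (the digit values below are made up):

```cpp
#include <cassert>

int main()
{
    // Horner's rule in base 2^4: scalar = ((d2 * 16) + d1) * 16 + d0.
    // Four doublings of the accumulator play the role of "* 16" on a point.
    const long digits[3] = { 5, 11, 3 }; // hypothetical window digits, MSB first
    const long point = 7;                // integer stand-in for a curve point

    long acc = 0;
    for (long d : digits) {
        for (int k = 0; k < 4; ++k) {
            acc += acc; // one doubling; four of them multiply the accumulator by 16
        }
        acc += d * point; // adding the lookup-table entry d*P
    }
    assert(acc == ((5 * 16 + 11) * 16 + 3) * point);
    return 0;
}
```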
+    // Apply skew for the first endo scalar
     if (skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = -lookup_table[0][j];
-        }
+        run_loop_in_parallel(
+            num_points,
+            [&lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    temp_point_vector[i] = -lookup_table[0][i];
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }
-
+    // Apply skew for the second endo scalar
     if (endo_skew) {
-        for (size_t j = 0; j < num_points; ++j) {
-            temp_point_vector[j] = lookup_table[0][j];
-            temp_point_vector[j].x *= beta;
-        }
+        run_loop_in_parallel(
+            num_points,
+            [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+
+                    temp_point_vector[i] = lookup_table[0][i];
+                    temp_point_vector[i].x *= beta;
+                }
+            },
+            /*no_multhreading_if_less_or_equal=*/16);
         batch_affine_add(&temp_point_vector[0], &work_elements[0]);
     }