From fd1f619f443916c172b6311aa71a84153145ef4d Mon Sep 17 00:00:00 2001 From: Innokentii Sennovskii Date: Sat, 13 Jan 2024 16:38:01 +0000 Subject: [PATCH] feat: Benchmarks for basic functionality and IPA improvements (#4004) This PR: 1. Adds benchmarks to gauge how much operations such as finite field addition and splitting into parallel threads with parallel_for cost 2. Introduces a new function `run_loop_in_parallel_if_effective` which calculates if it's worth splitting the workload into separate threads before doing it 3. Updates IPA and the batch_mul_with_endomorphism to use this function 4. Adds batch_affine_add method to element and uses it to make IPA faster. --- .../src/barretenberg/benchmark/CMakeLists.txt | 3 +- .../benchmark/basics_bench/CMakeLists.txt | 18 + .../basics_bench/analyse_all_benchmarks.py | 88 ++++ .../benchmark/basics_bench/basics.bench.cpp | 382 ++++++++++++++++++ .../basics_bench/single_benchmark_analysis.py | 46 +++ .../commitment_schemes/ipa/ipa.hpp | 47 ++- .../cpp/src/barretenberg/common/thread.cpp | 77 ++++ .../cpp/src/barretenberg/common/thread.hpp | 11 +- .../src/barretenberg/ecc/groups/element.hpp | 3 + .../barretenberg/ecc/groups/element_impl.hpp | 192 +++++++-- 10 files changed, 817 insertions(+), 50 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp create mode 100644 barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py diff --git a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt index 968acb82531..285f2bb5937 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt +++ b/barretenberg/cpp/src/barretenberg/benchmark/CMakeLists.txt @@ -1,9 +1,10 @@ -add_subdirectory(ipa_bench) add_subdirectory(decrypt_bench) +add_subdirectory(ipa_bench) add_subdirectory(pippenger_bench) add_subdirectory(plonk_bench) add_subdirectory(ultra_bench) add_subdirectory(goblin_bench) +add_subdirectory(basics_bench) add_subdirectory(relations_bench) add_subdirectory(widgets_bench) add_subdirectory(protogalaxy_bench) \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt new file mode 100644 index 00000000000..d23c4f6597f --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/CMakeLists.txt @@ -0,0 +1,18 @@ +# Each source represents a separate benchmark suite +set(BENCHMARK_SOURCES + basics.bench.cpp +) + +# Required libraries for benchmark suites +set(LINKED_LIBRARIES + benchmark::benchmark + ecc +) + +# Add executable and custom target for each suite, e.g. 
ultra_honk_bench +foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES}) + get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension + add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES}) + add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endforeach() \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py new file mode 100644 index 00000000000..5edebdb815c --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py @@ -0,0 +1,88 @@ +#!/usr/bin/python3 +""" +Tool for analysing several benchmarks from basics_bench to calculate operation timings +For example, in src directory: +python3 ../src/barretenberg/benchmark/basics_bench/analyse_all_benchmarks.py -f bin/basics_bench +""" +import argparse +import subprocess +import tempfile +from single_benchmark_analysis import evaluate_benchmark_from_file +import os + +# Some of the benchmarks use other operations to randomise the procedure, so we need to subtract the results +filter_rules={ + "sequential_copy":"cycle_waste", + "cycle_waste":None, + "parallel_for_field_element_addition:":None, + "ff_addition":"cycle_waste", + "ff_multiplication":"cycle_waste", + "ff_sqr":"cycle_waste", + "ff_invert":"ff_addition", + "ff_to_montgomery":"cycle_waste", + "ff_from_montgomery":"cycle_waste", + "ff_reduce":"ff_addition", + "projective_point_addition":"cycle_waste", + "projective_point_accidental_doubling":"cycle_waste", + "projective_point_doubling":"cycle_waste", + "scalar_multiplication":"ff_addition", +} +def get_benchmarks(filename): + """ + Get a list of benchmarks from the binary + """ + result=subprocess.run([filename,"--benchmark_list_tests"],capture_output=True) + result.check_returncode() + output_lines=result.stdout.splitlines() + benchmark_names=set([x.decode().split('/')[0] for x in output_lines]) + return sorted(list(benchmark_names)) + +def run_benchmarks(filename,bnames): + """ + Run benchmarks for each type and collect results + """ + benchmark_results=dict() + for bname in bnames: + output_file=tempfile.mktemp() + result=subprocess.run([filename,f"--benchmark_filter={bname}.*",f"--benchmark_out={output_file}","--benchmark_out_format=csv"]) + result.check_returncode() + benchmark_result=evaluate_benchmark_from_file(output_file)*1000 + benchmark_results[bname]=benchmark_result + print (f"Benchmark {bname} unfiltered: {benchmark_result} ns") + os.remove(output_file) + + return benchmark_results + +def filter_benchmarks(benchmark_results): + """ + Apply filtering rules and print the benchmarks + """ + global filter_rules + print ("Filtered benchmark results:") + max_len=0 + for bname in sorted(benchmark_results.keys()): + if len(bname)>max_len: + max_len=len(bname) + for bname in sorted(benchmark_results.keys()): + if bname not in filter_rules.keys() or filter_rules[bname]==None: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]:.1f}") + else: + print(f"\t{bname}:{' '*(max_len-len(bname))}\t{benchmark_results[bname]-benchmark_results[filter_rules[bname]]:.1f}") + +if __name__=="__main__": + parser=argparse.ArgumentParser(description='Run all the individual benchmarks',epilog='This expects a single file with a single type of benchmark /i') + 
parser.add_argument("-f","--file",dest="filename",required=True,help="run benchmark FILE", metavar="FILE") + args=parser.parse_args() + filename=args.filename + if filename==None: + parser.print_help() + exit() + benchmark_names=get_benchmarks(filename) + print("Will run the following benchmarks:") + for bname in benchmark_names: + print(f'\t{bname}') + unfiltered_results=run_benchmarks(filename,benchmark_names) + filter_benchmarks(unfiltered_results) + + + \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp new file mode 100644 index 00000000000..32f6082d3a8 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/basics.bench.cpp @@ -0,0 +1,382 @@ +/** + * @file parallel.bench.cpp + * @author Rumata888 + * @brief Simple and not too strict benchmarks for most basic operations used in barretenberg + * @details Filtered benchmark results (nanoseconds): + * cycle_waste: 0.5 + * ff_addition: 3.8 + * ff_from_montgomery: 19.1 + * ff_invert: 7001.3 + * ff_multiplication: 21.3 + * ff_reduce: 5.1 + * ff_sqr: 17.9 + * ff_to_montgomery: 39.1 + * parallel_for_field_element_addition: 198000~388000 (The number is somewhat dependent on the number of cores + * used) + * projective_point_accidental_doubling: 347.6 + * projective_point_addition: 348.6 + * projective_point_doubling: 194.2 + * scalar_multiplication: 50060.1 + * sequential_copy: 3.3 + * + */ +#include "barretenberg/common/thread.hpp" +#include "barretenberg/ecc/curves/bn254/bn254.hpp" +#include + +using namespace benchmark; +using namespace barretenberg; +namespace { +using Curve = curve::BN254; +using Fr = Curve::ScalarField; +#define MAX_REPETITION_LOG 12 + +/** + * @brief Benchmark for evaluating the cost of starting parallel_for + * + * @details It seems parallel_for takes ~400 microseconds to start when we use all the cores. When it's just 1 it's 200 + * microseconds. 
+ * The dependency is not exactly linear, so in the code we use the largest value for convenience.
+ * @param state
+ */
+void parallel_for_field_element_addition(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    size_t num_cpus = get_num_cpus();
+    std::vector<std::vector<Fr>> copy_vector(num_cpus);
+    for (size_t i = 0; i < num_cpus; i++) {
+        for (size_t j = 0; j < 2; j++) {
+            copy_vector[i].emplace_back(Fr::random_element(&engine));
+            copy_vector[i].emplace_back(Fr::random_element(&engine));
+        }
+    }
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_external_cycles = 1 << static_cast<size_t>(state.range(0));
+        size_t num_internal_cycles = 1 << (MAX_REPETITION_LOG - static_cast<size_t>(state.range(0)));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_external_cycles; i++) {
+            parallel_for(num_cpus, [num_internal_cycles, &copy_vector](size_t index) {
+                for (size_t i = 0; i < num_internal_cycles; i++) {
+                    copy_vector[index][i & 1] += copy_vector[index][1 - (i & 1)];
+                }
+            });
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field addition costs (in cache)
+ *
+ * @details ~4 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_addition(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[i & 1] += copy_vector[1 - (i & 1)];
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field multiplication costs (in cache)
+ *
+ * @details ~21 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_multiplication(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[i & 1] *= copy_vector[1 - (i & 1)];
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field squaring costs (in cache)
+ *
+ * @details ~18 ns once we subtract the i++ operation
+ * @param state
+ */
+void ff_sqr(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    std::vector<Fr> copy_vector(2);
+    for (size_t j = 0; j < 2; j++) {
+        copy_vector.emplace_back(Fr::random_element(&engine));
+        copy_vector.emplace_back(Fr::random_element(&engine));
+    }
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            copy_vector[0] = copy_vector[0].sqr();
+        }
+    }
+}
+
+/**
+ * @brief Evaluate how much finite field inversion costs (in cache)
+ *
+ * @details ~7100 ns once we subtract the addition and i++ operations
+ * @param state
+ */
+void ff_invert(State& state)
+{
+    numeric::random::Engine& engine = numeric::random::get_debug_engine();
+    auto element = Fr::random_element(&engine);
+
+    for (auto _ : state) {
+        state.PauseTiming();
+        size_t num_cycles = 1 << static_cast<size_t>(state.range(0));
+        state.ResumeTiming();
+        for (size_t i = 0; i < num_cycles; i++) {
+            element = (element +
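+                // (the +1 varies the input each iteration so repeated inversions cannot be optimized away)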
Fr::one()).invert(); + } + } +} + +/** + * @brief Evaluate how much conversion to montgomery costs (in cache) + * + *@details ~39 ns if we subtract i++ operation + * @param state + */ +void ff_to_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.to_montgomery_form(); + } + } +} +/** + * @brief Evaluate how much conversion from montgomery costs (in cache) + * + *@details ~19 ns if we subtract i++ operation + * @param state + */ +void ff_from_montgomery(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element.from_montgomery_form(); + } + } +} + +/** + * @brief Evaluate how much reduction costs (in cache) + * + *@details ~5 ns if we subtract addition and i++ operation + * @param state + */ +void ff_reduce(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + auto element = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = (element + element).reduce_once(); + } + } +} + +/** + * @brief Evaluate how much projective point addition costs (in cache) + * + *@details ~350 ns if we subtract i++ operation + * @param state + */ +void projective_point_addition(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[i & 1] += copy_vector[1 - (i & 1)]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs when we trigger it through addition (in cache) + * + *@details ~354 ns if we subtract i++ operation + * @param state + */ +void projective_point_accidental_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] += copy_vector[0]; + } + } +} + +/** + * @brief Evaluate how much projective point doubling costs (in cache) + * + *@details ~195 ns if we subtract i++ operation + * @param state + */ +void projective_point_doubling(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + std::vector copy_vector(2); + for (size_t j = 0; j < 2; j++) { + copy_vector.emplace_back(Curve::Element::random_element(&engine)); + 
copy_vector.emplace_back(Curve::Element::random_element(&engine)); + } + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + copy_vector[0] = copy_vector[0].dbl(); + } + } +} + +/** + * @brief Evaluate how much scalar multiplication costs (in cache) + * + *@details ~50000 ns + * @param state + */ +void scalar_multiplication(State& state) +{ + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + Curve::Element element = Curve::Element::random_element(&engine); + Fr scalar = Fr::random_element(&engine); + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + element = element * scalar; + scalar += scalar; + } + } +} +/** + * @brief Evaluate how much running the loop costs in benchmarks + * + * @details 0.6~0.7 ns per cycle + * @param state + */ +void cycle_waste(State& state) +{ + + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + state.ResumeTiming(); + for (volatile size_t i = 0; i < num_cycles;) { + i = i + 1; + } + } +} + +/** + * @brief Evaluate how much copying memory for large vectors costs + * + * @details 5 ns per cycle + * @param state + */ +void sequential_copy(State& state) +{ + + numeric::random::Engine& engine = numeric::random::get_debug_engine(); + for (auto _ : state) { + state.PauseTiming(); + size_t num_cycles = 1 << static_cast(state.range(0)); + std::vector input(num_cycles); + for (size_t i = 0; i < num_cycles; i++) { + *(uint256_t*)&input[i] = engine.get_random_uint256(); + } + std::vector output(num_cycles); + + state.ResumeTiming(); + for (size_t i = 0; i < num_cycles; i++) { + output[i] = input[i]; + } + } +} +} // namespace + +BENCHMARK(parallel_for_field_element_addition)->Unit(kMicrosecond)->DenseRange(0, MAX_REPETITION_LOG); +BENCHMARK(ff_addition)->Unit(kMicrosecond)->DenseRange(12, 30); +BENCHMARK(ff_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_sqr)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_invert)->Unit(kMicrosecond)->DenseRange(12, 19); +BENCHMARK(ff_to_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_from_montgomery)->Unit(kMicrosecond)->DenseRange(12, 27); +BENCHMARK(ff_reduce)->Unit(kMicrosecond)->DenseRange(12, 29); +BENCHMARK(projective_point_addition)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_accidental_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(projective_point_doubling)->Unit(kMicrosecond)->DenseRange(12, 22); +BENCHMARK(scalar_multiplication)->Unit(kMicrosecond)->DenseRange(12, 18); +BENCHMARK(cycle_waste)->Unit(kMicrosecond)->DenseRange(20, 30); +BENCHMARK(sequential_copy)->Unit(kMicrosecond)->DenseRange(20, 25); +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py new file mode 100644 index 00000000000..3f039a21860 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py @@ -0,0 +1,46 @@ +#!/usr/bin/python3 +""" +Tool for analyzing a single benchmark file generated by basics_bench. 
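+The script fits a line through the measurements taken over a range of input sizes and reports the
+per-iteration cost (the linear factor).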
Also used by "analyse_all_benchmarks.py" +For example, in src directory: +./bin/basics --benchmark_filter="par.*" --benchmark_out=parallel_for.csv --benchmark_out_format=csv +python3 ../src/barretenberg/benchmark/basics_bench/single_benchmark_analysis.py -f parallel_for.csv +""" +import numpy as np +import argparse +from io import StringIO + +def evaluate_benchmark_from_file(filename): + """ + Take a benchmark file, remove waste and calculate the linear factor + """ + lines=[] + header_found=False + x_exponents=[] + # Google benchmarks have a few extra lines at the start, so we need to skip them + with open (filename) as f: + for line in f: + if line.find("name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message")!=-1: + header_found=True + lines.append(line) + continue + if header_found: + lines.append(line) + x_exponents.append(int(line.replace('"','').split(',')[0].split('/')[1])) + + data=np.genfromtxt(StringIO('\n'.join(lines)),delimiter=",",usemask=True) + # Calculate the linear factor + y=np.transpose(data[1:])[2] + x=np.array([1< class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&G_vec_local, srs_elements](size_t start, size_t end) { for (size_t i = start * 2; i < end * 2; i += 2) { G_vec_local[i >> 1] = srs_elements[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); std::vector b_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&b_vec, &opening_pair](size_t start, size_t end) { Fr b_power = opening_pair.challenge.pow(start); @@ -75,7 +81,8 @@ template class IPA { b_power *= opening_pair.challenge; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1); // Iterate for log(poly_degree) rounds to compute the round commitments. 
auto log_poly_degree = static_cast(numeric::get_msb(poly_degree)); @@ -91,7 +98,7 @@ template class IPA { Fr inner_prod_L = Fr::zero(); Fr inner_prod_R = Fr::zero(); // Run scalar product in parallel - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, [&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) { Fr current_inner_prod_L = Fr::zero(); @@ -105,7 +112,8 @@ template class IPA { inner_prod_R += current_inner_prod_R; addition_lock.unlock(); }, - /*no_multhreading_if_less_or_equal=*/8); + /*finite_field_additions_per_iteration=*/2, + /*finite_field_multiplications_per_iteration=*/2); // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( @@ -137,20 +145,20 @@ template class IPA { // a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv // b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge // G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge - run_loop_in_parallel( + run_loop_in_parallel_if_effective( round_size, - [&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size]( - size_t start, size_t end) { + [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) { for (size_t j = start; j < end; j++) { a_vec[j] *= round_challenge; a_vec[j] += round_challenge_inv * a_vec[round_size + j]; b_vec[j] *= round_challenge_inv; b_vec[j] += round_challenge * b_vec[round_size + j]; - - G_vec_local[j] = G_lo[j] + G_hi[j]; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/4, + /*finite_field_multiplications_per_iteration=*/8, + /*finite_field_inversions_per_iteration=*/1); + GroupElement::batch_affine_add(G_lo, G_hi, G_vec_local); } transcript->send_to_verifier("IPA:a_0", a_vec[0]); @@ -218,7 +226,7 @@ template class IPA { // Compute G_zero // First construct s_vec std::vector s_vec(poly_degree); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( poly_degree, [&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { for (size_t i = start; i < end; i++) { @@ -235,7 +243,8 @@ template class IPA { s_vec[i] = s_vec_scalar; } }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/log_poly_degree); auto srs_elements = vk->srs->get_monomial_points(); @@ -245,14 +254,20 @@ template class IPA { // The SRS stored in the commitment key is the result after applying the pippenger point table so the // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism // G_vec_local should use only the original SRS thus we extract only the even indices. 
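        // (i.e. the point table is laid out {P_0, endo(P_0), P_1, endo(P_1), ...}, so the original P_i
        // lives at index 2*i; hence the stride-2 copy below)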
-        run_loop_in_parallel(
+        run_loop_in_parallel_if_effective(
             poly_degree,
             [&G_vec_local, srs_elements](size_t start, size_t end) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
             },
-            /*no_multhreading_if_less_or_equal=*/16);
+            /*finite_field_additions_per_iteration=*/0,
+            /*finite_field_multiplications_per_iteration=*/0,
+            /*finite_field_inversions_per_iteration=*/0,
+            /*group_element_additions_per_iteration=*/0,
+            /*group_element_doublings_per_iteration=*/0,
+            /*scalar_multiplications_per_iteration=*/0,
+            /*sequential_copy_ops_per_iteration=*/1);
 
         auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points(
             &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
index 232c30be69d..b90a44ae146 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.cpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -127,3 +127,80 @@ void run_loop_in_parallel(size_t num_points,
         func(start, end);
     });
 };
+
+/**
+ * @brief Split a loop into several loops running in parallel, based on the operations in one iteration
+ *
+ * @details Splits num_points into an appropriate number of chunks and calls the function that should
+ * contain the work loop on each of them, but only if parallelization is worth it
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [](size_t start, size_t end){for (size_t i=start; i<end; i++){...}}
+ */
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration,
+                                       size_t finite_field_multiplications_per_iteration,
+                                       size_t finite_field_inversions_per_iteration,
+                                       size_t group_element_additions_per_iteration,
+                                       size_t group_element_doublings_per_iteration,
+                                       size_t scalar_multiplications_per_iteration,
+                                       size_t sequential_copy_ops_per_iteration)
+{
+    // Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds):
+    constexpr size_t FF_ADDITION_COST = 4;
+    constexpr size_t FF_MULTIPLICATION_COST = 21;
+    constexpr size_t FF_INVERSION_COST = 7000;
+    constexpr size_t GE_ADDITION_COST = 350;
+    constexpr size_t GE_DOUBLING_COST = 194;
+    constexpr size_t SM_COST = 50000;
+    constexpr size_t SEQ_COPY_COST = 3;
+    // We take the maximum observed parallel_for startup cost (388 us) and round it up.
+    // The goal of these checks is to avoid increasing processing time significantly (10x) for small
+    // workloads, so we accept not triggering parallel_for even when it would make a medium workload
+    // faster by up to half a millisecond
+    constexpr size_t PARALLEL_FOR_COST = 400000;
+    // Get the number of cpus we can split the work into
+    const size_t num_cpus = get_num_cpus();
+
+    // Compute the size of a single chunk
+    const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
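+    // Worked example (illustrative figures, assuming num_cpus = 16): for 2^20 iterations of one field
+    // addition each, offset_cost ~= (2^20 - 2^16) * 4 ns ~= 3.9 ms > PARALLEL_FOR_COST, so the loop is
+    // parallelized; for 2^16 iterations the estimated saving (~250 us) is below the ~400 us startup
+    // cost, so the loop runs on a single thread.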
+
+    // Compute the cost of all operations done by other threads
+    const size_t offset_cost =
+        (num_points - chunk_size) *
+        (finite_field_additions_per_iteration * FF_ADDITION_COST +
+         finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
+         finite_field_inversions_per_iteration * FF_INVERSION_COST +
+         group_element_additions_per_iteration * GE_ADDITION_COST +
+         group_element_doublings_per_iteration * GE_DOUBLING_COST +
+         scalar_multiplications_per_iteration * SM_COST +
+         sequential_copy_ops_per_iteration * SEQ_COPY_COST);
+
+    // If starting parallel_for takes longer than the computation itself, just compute
+    if (offset_cost < PARALLEL_FOR_COST) {
+        func(0, num_points);
+        return;
+    }
+    // Parallelize over chunks
+    parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
+        // If num_points is small, sometimes we need fewer CPUs
+        if (chunk_size * chunk_index > num_points) {
+            return;
+        }
+        // Compute the current chunk size (can differ for the last chunk)
+        size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
+        if (current_chunk_size == 0) {
+            return;
+        }
+        size_t start = chunk_index * chunk_size;
+        size_t end = chunk_index * chunk_size + current_chunk_size;
+        func(start, end);
+    });
+};
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp
index fe799e1d46f..787f0313d9e 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -25,4 +25,13 @@ inline size_t get_num_cpus_pow2()
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
 void run_loop_in_parallel(size_t num_points,
                           const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal = 0);
\ No newline at end of file
+                          size_t no_multhreading_if_less_or_equal = 0);
+void run_loop_in_parallel_if_effective(size_t num_points,
+                                       const std::function<void(size_t, size_t)>& func,
+                                       size_t finite_field_additions_per_iteration = 0,
+                                       size_t finite_field_multiplications_per_iteration = 0,
+                                       size_t finite_field_inversions_per_iteration = 0,
+                                       size_t group_element_additions_per_iteration = 0,
+                                       size_t group_element_doublings_per_iteration = 0,
+                                       size_t scalar_multiplications_per_iteration = 0,
+                                       size_t sequential_copy_ops_per_iteration = 0);
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
index 420b4649856..22477f8352e 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
@@ -91,6 +91,9 @@ template <class Fq, class Fr, class T> class alignas(32) element {
     BBERG_INLINE constexpr bool operator==(const element& other) const noexcept;
 
     static void batch_normalize(element* elements, size_t num_elements) noexcept;
+    static void batch_affine_add(const std::span<affine_element<Fq, Fr, T>>& first_group,
+                                 const std::span<affine_element<Fq, Fr, T>>& second_group,
+                                 const std::span<affine_element<Fq, Fr, T>>& results) noexcept;
     static std::vector<affine_element<Fq, Fr, T>> batch_mul_with_endomorphism(
         const std::span<affine_element<Fq, Fr, T>>& points, const Fr& exponent) noexcept;
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
index da98fb74f8f..a679fa6152c 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -688,6 +688,89 @@ element<Fq, Fr, T> element<Fq, Fr, T>::mul_with_endomorphism(const Fr& exponent)
     return work_element;
 }
 
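+// Usage sketch (illustrative, not part of the diff): pairwise-add two arrays of affine points with a
+// single batched inversion, writing lhs[i] + rhs[i] into out[i]:
+//     std::vector<AffineElement> lhs = ..., rhs = ..., out(lhs.size());
+//     Element::batch_affine_add(lhs, rhs, out);
+// Note the batch-inversion formula divides by (x2 - x1), so the caller must ensure no pair triggers
+// the doubling or point-at-infinity edge cases.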
+/**
+ * @brief Pairwise add the affine points in first_group and second_group
+ *
+ * @param first_group
+ * @param second_group
+ * @param results
+ */
+template <class Fq, class Fr, class T>
+void element<Fq, Fr, T>::batch_affine_add(const std::span<affine_element<Fq, Fr, T>>& first_group,
+                                          const std::span<affine_element<Fq, Fr, T>>& second_group,
+                                          const std::span<affine_element<Fq, Fr, T>>& results) noexcept
+{
+    typedef affine_element<Fq, Fr, T> affine_element;
+    const size_t num_points = first_group.size();
+    ASSERT(second_group.size() == first_group.size());
+
+    // Space for temporary values
+    std::vector<Fq> scratch_space(num_points);
+
+    run_loop_in_parallel_if_effective(
+        num_points,
+        [&results, &first_group](size_t start, size_t end) {
+            for (size_t i = start; i < end; i++) {
+                results[i] = first_group[i];
+            }
+        },
+        /*finite_field_additions_per_iteration=*/0,
+        /*finite_field_multiplications_per_iteration=*/0,
+        /*finite_field_inversions_per_iteration=*/0,
+        /*group_element_additions_per_iteration=*/0,
+        /*group_element_doublings_per_iteration=*/0,
+        /*scalar_multiplications_per_iteration=*/0,
+        /*sequential_copy_ops_per_iteration=*/2);
+
+    // TODO(#826): Same code as in batch mul
+    // we can mutate rhs but NOT lhs!
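+    // (Montgomery batch inversion: the forward pass accumulates prefix products of (x2 - x1) while
+    // scaling (y2 - y1); after a single invert() the backward pass recovers each individual
+    // 1/(x2 - x1), giving lambda = (y2 - y1)/(x2 - x1), x3 = lambda^2 - x1 - x2 and
+    // y3 = lambda * (x1 - x3) - y1 per point, at the cost of one field inversion overall.)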
+ // output is stored in rhs /** * @brief Perform point addition rhs[i]=rhs[i]+lhs[i] with batch inversion * @@ -743,15 +827,16 @@ std::vector> element::batch_mul_with_endomo * @brief Perform batch affine addition in parallel * */ - const auto batch_affine_add = [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, - affine_element* rhs) { - run_loop_in_parallel( - num_points, - [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { - batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); - }, - /*no_multhreading_if_less_or_equal=*/4); - }; + const auto batch_affine_add_internal = + [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) { + run_loop_in_parallel_if_effective( + num_points, + [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) { + batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start); + }, + /*finite_field_additions_per_iteration=*/6, + /*finite_field_multiplications_per_iteration=*/6); + }; /** * @brief Perform point doubling lhs[i]=lhs[i]+lhs[i] with batch inversion @@ -789,12 +874,13 @@ std::vector> element::batch_mul_with_endomo * */ const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) { batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start); }, - /*no_multhreading_if_less_or_equal=*/4); + /*finite_field_additions_per_iteration=*/7, + /*finite_field_multiplications_per_iteration=*/6); }; // Compute wnaf for scalar const Fr converted_scalar = exponent.from_montgomery_form(); @@ -804,14 +890,20 @@ std::vector> element::batch_mul_with_endomo affine_element result{ Fq::zero(), Fq::zero() }; result.self_set_infinity(); std::vector results(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&results, result](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { results[i] = result; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); return results; } @@ -824,7 +916,7 @@ std::vector> element::batch_mul_with_endomo } // Initialize first etnries in lookup table std::vector temp_point_vector(num_points); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -832,20 +924,32 @@ std::vector> element::batch_mul_with_endomo lookup_table[0][i] = points[i]; } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/2); // Construct lookup table batch_affine_double(&temp_point_vector[0]); for (size_t j = 1; j < lookup_size; ++j) { - 
run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, &lookup_table](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { lookup_table[j][i] = lookup_table[j - 1][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &lookup_table[j][0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &lookup_table[j][0]); } uint64_t wnaf_table[num_rounds * 2]; @@ -873,7 +977,7 @@ std::vector> element::batch_mul_with_endomo index = wnaf_entry & 0x0fffffffU; sign = static_cast((wnaf_entry >> 31) & 1); const bool is_odd = ((j & 1) == 1); - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start, size_t end) { @@ -891,10 +995,16 @@ std::vector> element::batch_mul_with_endomo } } }, - /*no_multhreading_if_less_or_equal=*/16); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); } // First cycle of addition - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); // Run through SM logic in wnaf form (excluding the skew) for (size_t j = 2; j < num_rounds * 2; ++j) { wnaf_entry = wnaf_table[j]; @@ -906,7 +1016,7 @@ std::vector> element::batch_mul_with_endomo batch_affine_double(&work_elements[0]); } } - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -919,13 +1029,19 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = to_add; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/1, + /*finite_field_multiplications_per_iteration=*/is_odd ? 
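                // (odd wnaf rounds use the endomorphism copy of the point, which needs one extra Fq
                // multiplication by beta)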
1 : 0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the first endo scalar if (skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [&lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -933,12 +1049,18 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i] = -lookup_table[0][i]; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/0, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } // Apply skew for the second endo scalar if (endo_skew) { - run_loop_in_parallel( + run_loop_in_parallel_if_effective( num_points, [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) { for (size_t i = start; i < end; ++i) { @@ -947,8 +1069,14 @@ std::vector> element::batch_mul_with_endomo temp_point_vector[i].x *= beta; } }, - /*no_multhreading_if_less_or_equal=*/16); - batch_affine_add(&temp_point_vector[0], &work_elements[0]); + /*finite_field_additions_per_iteration=*/0, + /*finite_field_multiplications_per_iteration=*/1, + /*finite_field_inversions_per_iteration=*/0, + /*group_element_additions_per_iteration=*/0, + /*group_element_doublings_per_iteration=*/0, + /*scalar_multiplications_per_iteration=*/0, + /*sequential_copy_ops_per_iteration=*/1); + batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]); } return work_elements;