From 12cc9db4fe979f51c6702f9fa68529572a11ba05 Mon Sep 17 00:00:00 2001 From: codygunton Date: Wed, 24 May 2023 09:45:27 +0000 Subject: [PATCH] Templatize: everything builds and links --- cpp/.clangd | 2 - .../benchmark/pippenger_bench/main.cpp | 7 +- .../dsl/acir_proofs/acir_proofs.cpp | 4 +- .../barretenberg/ecc/curves/bn254/bn254.hpp | 4 +- .../bn254/scalar_multiplication/c_bind.cpp | 13 +- .../bn254/scalar_multiplication/pippenger.cpp | 44 - .../bn254/scalar_multiplication/pippenger.hpp | 62 -- .../scalar_multiplication/process_buckets.cpp | 64 -- .../scalar_multiplication/process_buckets.hpp | 12 - .../scalar_multiplication/runtime_states.cpp | 212 ---- .../scalar_multiplication/runtime_states.hpp | 98 -- .../scalar_multiplication.cpp | 946 ----------------- .../scalar_multiplication.hpp | 155 --- .../scalar_multiplication.test.cpp | 937 ----------------- .../ecc/curves/grumpkin/grumpkin.hpp | 5 +- .../grumpkin/scalar_multiplication/c_bind.cpp | 50 - .../grumpkin/scalar_multiplication/c_bind.hpp | 18 - .../scalar_multiplication/pippenger.cpp | 44 - .../scalar_multiplication/pippenger.hpp | 62 -- .../scalar_multiplication/process_buckets.cpp | 64 -- .../scalar_multiplication/process_buckets.hpp | 12 - .../scalar_multiplication/runtime_states.cpp | 212 ---- .../scalar_multiplication/runtime_states.hpp | 100 -- .../scalar_multiplication.cpp | 947 ------------------ .../scalar_multiplication.hpp | 154 --- .../scalar_multiplication.test.cpp | 946 ----------------- .../curves/scalar_multiplication/c_bind.cpp | 50 - .../curves/scalar_multiplication/c_bind.hpp | 18 - .../scalar_multiplication/pippenger.cpp | 38 +- .../scalar_multiplication/pippenger.hpp | 19 +- .../scalar_multiplication/process_buckets.cpp | 4 +- .../scalar_multiplication/process_buckets.hpp | 4 +- .../scalar_multiplication/runtime_states.cpp | 56 +- .../scalar_multiplication/runtime_states.hpp | 35 +- .../scalar_multiplication.cpp | 385 ++++--- .../scalar_multiplication.hpp | 149 ++- .../scalar_multiplication.test.cpp | 246 +++-- .../ecc/curves/secp256k1/secp256k1.hpp | 5 +- .../ecc/curves/secp256r1/secp256r1.hpp | 5 +- .../barretenberg/honk/pcs/commitment_key.hpp | 12 +- cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp | 19 +- .../honk/proof_system/ultra_verifier.cpp | 4 +- .../honk/proof_system/verifier.cpp | 2 +- .../proofs/join_split/c_bind.cpp | 3 +- .../plonk/composer/composer_base.cpp | 8 +- .../turbo_plonk_composer_helper.cpp | 2 +- .../plonk/composer/standard_composer.cpp | 2 +- .../plonk/composer/turbo_composer.cpp | 2 +- .../plonk/composer/ultra_composer.cpp | 2 +- .../plonk/proof_system/prover/prover.cpp | 2 +- .../proof_system/proving_key/proving_key.hpp | 9 +- .../plonk/proof_system/verifier/verifier.cpp | 9 +- .../proof_system/verifier/verifier.test.cpp | 12 +- .../permutation_widget_impl.hpp | 4 +- .../random_widgets/plookup_widget_impl.hpp | 2 +- .../polynomials/polynomials.bench.cpp | 47 +- .../turbo_circuit_constructor.cpp | 2 +- .../proof_system/work_queue/work_queue.cpp | 6 +- cpp/src/barretenberg/srs/io.cpp | 69 +- cpp/src/barretenberg/srs/io.hpp | 25 +- .../reference_string/env_reference_string.hpp | 4 +- .../file_reference_string.hpp | 4 +- .../reference_string/mem_reference_string.hpp | 2 +- .../pippenger_reference_string.hpp | 8 +- 64 files changed, 725 insertions(+), 5724 deletions(-) delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp delete mode 100644 
cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp diff --git a/cpp/.clangd b/cpp/.clangd index 599f23163a..06f5d0d059 100644 --- a/cpp/.clangd +++ b/cpp/.clangd @@ -59,8 +59,6 @@ Diagnostics: - readability-function-cognitive-complexity # It is often nicer to not be explicit - google-explicit-constructor - CheckOptions: - - cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: True --- # this divider is necessary # Disable some checks for Google Test/Bench diff --git a/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp b/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp index 40f2b15284..72325f78b3 100644 --- a/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp +++ b/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp @@ -1,7 +1,8 @@ #include #include "barretenberg/common/assert.hpp" #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/bn254/bn254.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/srs/reference_string/file_reference_string.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -63,9 +64,9 @@ const auto init = []() { int pippenger() { - scalar_multiplication::pippenger_runtime_state state(NUM_POINTS); + scalar_multiplication::pippenger_runtime_state state(NUM_POINTS); std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - g1::element result = scalar_multiplication::pippenger_unsafe( + g1::element result = scalar_multiplication::pippenger_unsafe( &scalars[0], 
reference_string->get_monomial_points(), NUM_POINTS, state); std::chrono::steady_clock::time_point time_end = std::chrono::steady_clock::now(); std::chrono::microseconds diff = std::chrono::duration_cast(time_end - time_start); diff --git a/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp b/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp index 2010c592fb..a987a66d28 100644 --- a/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp +++ b/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp @@ -72,7 +72,7 @@ size_t init_verification_key(void* pippenger, uint8_t const* g2x, uint8_t const* auto proving_key = std::make_shared(std::move(pk_data), crs); auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); proving_key->reference_string = crs_factory->get_prover_crs(proving_key->circuit_size); acir_format::Composer composer(proving_key, nullptr); @@ -108,7 +108,7 @@ size_t new_proof(void* pippenger, auto witness = from_buffer>(witness_buf); auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); proving_key->reference_string = crs_factory->get_prover_crs(proving_key->circuit_size); acir_format::Composer composer(proving_key, nullptr); diff --git a/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp b/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp index 55fae2d742..7302507e7e 100644 --- a/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp +++ b/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp @@ -2,13 +2,15 @@ #include "../bn254/fr.hpp" #include "../bn254/fq.hpp" #include "../bn254/g1.hpp" +#include "../bn254/g2.hpp" namespace curve { class BN254 { public: using ScalarField = barretenberg::fr; using BaseField = barretenberg::fq; - using ProjectiveElement = typename barretenberg::g1::element; + using Group = typename barretenberg::g1; + using Element = typename barretenberg::g1::element; using AffineElement = typename barretenberg::g1::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp index 66803cb5d5..74275eb514 100644 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp +++ b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp @@ -1,5 +1,6 @@ -#include "scalar_multiplication.hpp" -#include "pippenger.hpp" +#include "../bn254.hpp" +#include "../../scalar_multiplication/scalar_multiplication.hpp" +#include "../../scalar_multiplication/pippenger.hpp" #include "barretenberg/common/mem.hpp" using namespace barretenberg; @@ -21,19 +22,19 @@ WASM_EXPORT void bbfree(void* ptr) WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) { - auto ptr = new scalar_multiplication::Pippenger(points, num_points); + auto ptr = new scalar_multiplication::Pippenger(points, num_points); return ptr; } WASM_EXPORT void delete_pippenger(void* pippenger) { - delete reinterpret_cast(pippenger); + delete reinterpret_cast*>(pippenger); } WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* result_ptr) { - scalar_multiplication::pippenger_runtime_state state(range); - auto pippenger = reinterpret_cast(pippenger_ptr); + scalar_multiplication::pippenger_runtime_state state(range); + auto pippenger = reinterpret_cast*>(pippenger_ptr); auto scalars = reinterpret_cast(scalars_ptr); auto result = reinterpret_cast(result_ptr); *result = 
pippenger->pippenger_unsafe(scalars, from, range); diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp deleted file mode 100644 index cb8f93a6c3..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "pippenger.hpp" -#include "barretenberg/srs/io.hpp" -namespace barretenberg { -namespace scalar_multiplication { - -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) - : monomials_(points) - , num_points_(num_points) -{ - io::byteswap(&monomials_[0], num_points * 64); - scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(uint8_t const* points, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - barretenberg::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - barretenberg::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(std::string const& path, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - barretenberg::io::read_transcript_g1(monomials_, num_points, path); - barretenberg::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) -{ - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); -} - -Pippenger::~Pippenger() -{ - free(monomials_); -} - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp deleted file mode 100644 index 48a2c133f6..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once -#include "./scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace barretenberg { -namespace scalar_multiplication { - -inline size_t point_table_size(size_t num_points) -{ -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - - return 2 * num_points + prefetch_overflow; -} - -template inline size_t point_table_buf_size(size_t num_points) -{ - return sizeof(T) * point_table_size(num_points); -} - -template inline T* point_table_alloc(size_t num_points) -{ - return (T*)aligned_alloc(64, point_table_buf_size(num_points)); -} - -class Pippenger { - public: - /** - * Expects points to be buffer of size as per point_table_size(). - * It expects the crs to start at points[1], and it fills in affine_one at points[0]. - * The crs undergoes a byteswap, and then the point table is generated. 
- */ - Pippenger(g1::affine_element* points, size_t num_points); - - Pippenger(uint8_t const* points, size_t num_points); - - Pippenger(std::string const& path, size_t num_points); - - ~Pippenger(); - - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); - - g1::affine_element* get_point_table() const { return monomials_; } - - size_t get_num_points() const { return num_points_; } - - private: - g1::affine_element* monomials_; - size_t num_points_; -}; - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp deleted file mode 100644 index 01f92b8673..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "process_buckets.hpp" - -#include - -namespace barretenberg { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept -{ - constexpr size_t num_bits = 8; - constexpr size_t num_buckets = 1UL << num_bits; - constexpr uint32_t mask = static_cast(num_buckets) - 1U; - std::array bucket_counts{}; - - for (size_t i = 0; i < num_entries; ++i) { - bucket_counts[(keys[i] >> shift) & mask]++; - } - - std::array offsets; - std::array offsets_copy; - offsets[0] = 0; - - for (size_t i = 0; i < num_buckets - 1; ++i) { - bucket_counts[i + 1] += bucket_counts[i]; - } - for (size_t i = 1; i < num_buckets + 1; ++i) { - offsets[i] = bucket_counts[i - 1]; - } - for (size_t i = 0; i < num_buckets + 1; ++i) { - offsets_copy[i] = offsets[i]; - } - uint64_t* start = &keys[0]; - - for (size_t i = 0; i < num_buckets; ++i) { - uint64_t* bucket_start = &keys[offsets[i]]; - const uint64_t* bucket_end = &keys[offsets_copy[i + 1]]; - while (bucket_start != bucket_end) { - for (uint64_t* it = bucket_start; it < bucket_end; ++it) { - const size_t value = (*it >> shift) & mask; - const uint64_t offset = offsets[value]++; - std::iter_swap(it, start + offset); - } - bucket_start = &keys[offsets[i]]; - } - } - if (shift > 0) { - for (size_t i = 0; i < num_buckets; ++i) { - if (offsets_copy[i + 1] - offsets_copy[i] > 1) { - radix_sort(&keys[offsets_copy[i]], offsets_copy[i + 1] - offsets_copy[i], shift - 8); - } - } - } -} - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept -{ - const uint32_t bits_per_round = 8; - const uint32_t base = num_bits & 7; - const uint32_t total_bits = (base == 0) ? 
num_bits : num_bits - base + 8; - const uint32_t shift = total_bits - bits_per_round; - - radix_sort(wnaf_entries, num_entries, shift); -} -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp deleted file mode 100644 index bde5916663..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include - -namespace barretenberg { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp deleted file mode 100644 index 6e8aa5ccd8..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "runtime_states.hpp" - -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace barretenberg { -namespace scalar_multiplication { - -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) -{ - constexpr size_t MAX_NUM_ROUNDS = 256; - num_points = num_initial_points * 2; - const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); - const size_t num_buckets = static_cast( - 1U << barretenberg::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - const size_t num_rounds = - static_cast(barretenberg::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); - point_schedule = (uint64_t*)(aligned_alloc( - 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); - skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); - bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); - round_counts = (uint64_t*)(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))); - - const size_t points_per_thread = static_cast(num_points) / num_threads; -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - const size_t thread_offset = i * points_per_thread; - 
memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); - for (size_t j = 0; j < num_rounds; ++j) { - const size_t round_offset = (j * static_cast(num_points)); - memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); - } - memset((void*)(skew_table + thread_offset), 0, points_per_thread * sizeof(bool)); - } - - memset((void*)bucket_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bit_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bucket_empty_status, 0, num_threads * num_buckets * sizeof(bool)); - memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); -} - -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) -{ - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; -} - -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } - - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; - return *this; -} - -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) -{ - const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << barretenberg::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - - scalar_multiplication::affine_product_runtime_state product_state; - - product_state.point_pairs_1 = point_pairs_1 
+ (thread_index * points_per_thread) + (thread_index * 16); - product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); - product_state.scratch_space = scratch_space + (thread_index * (points_per_thread / 2)); - product_state.bucket_counts = bucket_counts + (thread_index * (num_buckets)); - product_state.bit_offsets = bit_counts + (thread_index * (num_buckets)); - product_state.bucket_empty_status = bucket_empty_status + (thread_index * (num_buckets)); - return product_state; -} - -pippenger_runtime_state::~pippenger_runtime_state() -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } -} -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp deleted file mode 100644 index 14c62eb089..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include "../g1.hpp" - -namespace barretenberg { -// simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. -// This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
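The pre-allocation strategy described above can be illustrated with a small standalone sketch (assumed names, using std::aligned_alloc and std::free directly rather than the wrappers in barretenberg/common/mem.hpp): zeroing a buffer immediately after allocating it forces the OS to map every page, so the multi-exponentiation hot loop is not interrupted by first-write page faults. This mirrors the memsets performed by the pippenger_runtime_state constructor.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative only: allocate a 64-byte-aligned buffer and touch every page up front.
static uint64_t* alloc_and_touch(size_t num_entries)
{
    // std::aligned_alloc requires the byte count to be a multiple of the alignment.
    const size_t bytes = ((num_entries * sizeof(uint64_t) + 63) / 64) * 64;
    auto* buffer = static_cast<uint64_t*>(std::aligned_alloc(64, bytes));
    if (buffer != nullptr) {
        std::memset(static_cast<void*>(buffer), 0, bytes); // fault the pages in now, not mid-algorithm
    }
    return buffer; // caller releases with std::free
}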
-namespace scalar_multiplication { -constexpr size_t get_optimal_bucket_width(const size_t num_points) -{ - if (num_points >= 14617149) { - return 21; - } - if (num_points >= 1139094) { - return 18; - } - // if (num_points >= 100000) - if (num_points >= 155975) { - return 15; - } - if (num_points >= 144834) - // if (num_points >= 100000) - { - return 14; - } - if (num_points >= 25067) { - return 12; - } - if (num_points >= 13926) { - return 11; - } - if (num_points >= 7659) { - return 10; - } - if (num_points >= 2436) { - return 9; - } - if (num_points >= 376) { - return 7; - } - if (num_points >= 231) { - return 6; - } - if (num_points >= 97) { - return 5; - } - if (num_points >= 35) { - return 4; - } - if (num_points >= 10) { - return 3; - } - if (num_points >= 2) { - return 2; - } - return 1; -} - -constexpr size_t get_num_rounds(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return WNAF_SIZE(bits_per_bucket + 1); -} - -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_offsets; - uint64_t* point_schedule; - uint32_t num_points; - uint32_t num_buckets; - bool* bucket_empty_status; -}; - -struct pippenger_runtime_state { - uint64_t* point_schedule; - bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_counts; - bool* bucket_empty_status; - uint64_t* round_counts; - uint64_t num_points; - - pippenger_runtime_state(const size_t num_initial_points); - pippenger_runtime_state(pippenger_runtime_state&& other); - pippenger_runtime_state& operator=(pippenger_runtime_state&& other); - ~pippenger_runtime_state(); - - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); -}; -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp deleted file mode 100644 index 8ff1782f1e..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp +++ /dev/null @@ -1,946 +0,0 @@ -#include "./scalar_multiplication.hpp" - -#include "barretenberg/common/throw_or_abort.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#include -#include -#include -#include - -#include "../../../groups/wnaf.hpp" -#include "../fq.hpp" -#include "../fr.hpp" -#include "../g1.hpp" -#include "./process_buckets.hpp" -#include "./runtime_states.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -#define BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 16] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 17] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 18] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 19] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 20] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 21] >> 32ULL)); \ - __builtin_prefetch(state.points + 
(state.point_schedule[schedule_it + 22] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 23] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 24] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 25] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 26] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 27] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 28] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 29] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 30] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 31] >> 32ULL)); \ - \ - uint64_t schedule_a = state.point_schedule[schedule_it]; \ - uint64_t schedule_b = state.point_schedule[schedule_it + 1]; \ - uint64_t schedule_c = state.point_schedule[schedule_it + 2]; \ - uint64_t schedule_d = state.point_schedule[schedule_it + 3]; \ - uint64_t schedule_e = state.point_schedule[schedule_it + 4]; \ - uint64_t schedule_f = state.point_schedule[schedule_it + 5]; \ - uint64_t schedule_g = state.point_schedule[schedule_it + 6]; \ - uint64_t schedule_h = state.point_schedule[schedule_it + 7]; \ - uint64_t schedule_i = state.point_schedule[schedule_it + 8]; \ - uint64_t schedule_j = state.point_schedule[schedule_it + 9]; \ - uint64_t schedule_k = state.point_schedule[schedule_it + 10]; \ - uint64_t schedule_l = state.point_schedule[schedule_it + 11]; \ - uint64_t schedule_m = state.point_schedule[schedule_it + 12]; \ - uint64_t schedule_n = state.point_schedule[schedule_it + 13]; \ - uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ - uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ - \ - g1::conditional_negate_affine( \ - state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 
1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ - \ - current_offset += 16; \ - schedule_it += 16; - -namespace barretenberg { -namespace scalar_multiplication { - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) -{ - // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); - for (size_t i = num_points - 1; i < num_points; --i) { - table[i * 2] = points[i]; - table[i * 2 + 1].x = beta * points[i].x; - table[i * 2 + 1].y = -points[i].y; - } -} - -/** - * Compute the windowed-non-adjacent-form versions of our scalar multipliers. - * - * We start by splitting our 254 bit scalars into 2 127-bit scalars, using the short weierstrass curve endomorphism - * (for a point P \in \G === (x, y) \in \Fq, then (\beta x, y) = (\lambda) * P , where \beta = 1^{1/3} mod Fq and - *\lambda = 1^{1/3} mod Fr) (which means we can represent a scalar multiplication (k * P) as (k1 * P + k2 * \lambda * - *P), where k1, k2 have 127 bits) (see field::split_into_endomorphism_scalars for more details) - * - * Once we have our 127-bit scalar multipliers, we determine the optimal number of pippenger rounds, given the number of - *points we're multiplying. Once we have the number of rounds, `m`, we need to split our scalar into `m` bit-slices. - *Each pippenger round will work on one bit-slice. - * - * Pippenger's algorithm works by, for each round, iterating over the points we're multplying. For each point, we - *examing the point's scalar multiplier and extract the bit-slice associated with the current pippenger round (we start - *with the most significant slice). We then use the bit-slice to index a 'bucket', which we add the point into. For - *example, if the bit slice is 01101, we add the corresponding point into bucket[13]. - * - * At the end of each pippenger round we concatenate the buckets together. E.g. if we have 8 buckets, we compute: - * sum = bucket[0] + 2 * bucket[1] + 3 * bucket[2] + 4 * bucket[3] + 5 * bucket[4] + 6 * bucket[5] + 7 * bucket[6] + 8 * - *bucket[7]. - * - * At the end of each pippenger round, the bucket sum will contain the scalar multiplication result for one bit slice. - * For example, say we have 16 rounds, where each bit slice contains 8 bits (8 * 16 = 128, enough to represent our 127 - *bit scalars). At the end of the first round, we will have taken the 8 most significant bits from every scalar - *multiplier. Our bucket sum will be the result of a mini-scalar-multiplication, where we have multiplied every point by - *the 8 most significant bits of each point's scalar multiplier. - * - * We repeat this process for every pippenger round. In our example, this gives us 16 bucket sums. 
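The bucket concatenation above (sum = bucket[0] + 2 * bucket[1] + ... + 8 * bucket[7]) only needs roughly two additions per bucket. Here is a toy sketch of that running-sum trick, with int64_t standing in for curve points purely for illustration and made-up names; a group-element variant of the same fold appears later in evaluate_pippenger_rounds.

#include <cstdint>
#include <vector>

// Walk the buckets from the top, keeping a running sum; adding the running sum into an
// accumulator after every step yields sum_k (k + 1) * bucket[k] in about 2n additions.
static int64_t fold_buckets(const std::vector<int64_t>& buckets)
{
    int64_t running_sum = 0; // bucket[n-1] + ... + bucket[k] after processing index k
    int64_t accumulator = 0; // accumulates the running sums
    for (size_t k = buckets.size(); k-- > 0;) {
        running_sum += buckets[k];
        accumulator += running_sum;
    }
    return accumulator;
}

int main()
{
    const std::vector<int64_t> buckets{ 5, 7, 11 }; // expect 1*5 + 2*7 + 3*11 = 52
    return fold_buckets(buckets) == 52 ? 0 : 1;
}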
- * We need to multiply the most significant bucket sum by 2^{120}, the second most significant bucket sum by 2^{112} - *etc. Once this is done we can add the bucket sums together, to evaluate our scalar multiplication result. - * - * Pippenger has complexity O(n / log n), because of two factors at play: the number of buckets we need to concatenate - *per round, and the number of points we need to add into buckets per round. - * - * To minimize the number of point additions per round, we want fewer rounds. But fewer rounds increases the number of - *bucket concatenations. The more points we have, the greater the time saving when reducing the number of rounds, which - *means we can afford to have more buckets per round. - * - * For a concrete example, with 2^20 points, the sweet spot is 2^15 buckets - with 2^15 buckets we can evaluate our 127 - *bit scalar multipliers in 8 rounds (we can represent b-bit windows with 2^{b-1} buckets, more on that below). - * - * This means that, for each round, we add 2^21 points into buckets (we've split our scalar multipliers into two - *half-width multipliers, so each round has twice the number of points. This is the reason why the endomorphism is - *useful here; without the endomorphism, we would need twice the number of buckets for each round). - * - * We also concatenate 2^15 buckets for each round. This requires 2^16 point additions. - * - * Meaning that the total number of point additions is (8 * 2^21) + (8 * 2^16) = 33 * 2^19 ~ 2^24 point additions. - * If we were to use a simple Montgomery double-and-add ladder to exponentiate each point, we would need 2^27 point - *additions (each scalar multiplier has ~2^7 non-zero bits, and there are 2^20 points). - * - * This makes Pippenger 8 times faster than the naive O(n) equivalent. Given that a circuit with 1 million gates will - *require 9 multiple-scalar-multiplications with 2^20 points, efficiently using Pippenger's algorithm is essential for - *fast provers. - * - * One additional efficiency gain is the use of 2^{b-1} buckets to represent b bits. To do this we represent our - *bit-slices in non-adjacent form. Non-adjacent form represents values using a base, where each 'bit' can take the - *values (-1, 0, 1). This is considerably more efficient than binary form for scalar multiplication, as inverting a - *point can be done by negating the y-coordinate. - * - * We actually use a slightly different representation than simple non-adjacent form. To represent b bits, a bit slice - *contains values from (-2^{b} + 1, ..., -1, 1, ..., 2^{b} - 1). i.e. we only have odd values. We do this to eliminate - *0-valued windows, as having a conditional branch in our hot loop to check if an entry is 0 is something we want to - *avoid. - * - * The above representation can be used to represent any binary number as long as we add a 'skew' factor. Each scalar - *multiplier's `skew` tracks if the scalar multiplier is even or odd. If it's even, `skew = true`, and we add `1` to our - *multiplier to make it odd. - * - * We then, at the end of the Pippenger algorithm, subtract a point from the total result, if that point's skew is - *`true`. - * - * At the end of `compute_wnaf_states`, `state.wnaf_table` will contain our wnaf entries, but unsorted. 
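A standalone sketch of the odd-digit window plus skew representation described above, using toy uint64_t scalars and made-up names (this is only the idea, not the wnaf::fixed_wnaf_with_counts routine): even scalars get skew = true and are bumped to the next odd value, and the odd scalar is then split into w-bit digits that are always odd, choosing whichever signed representative keeps the remaining scalar odd.

#include <cassert>
#include <cstdint>
#include <vector>

struct OddWindowForm {
    bool skew;                   // true if the original scalar was even (subtract the point at the end)
    std::vector<int64_t> digits; // odd digits, |digit| <= 2^w - 1, least significant first
};

static OddWindowForm to_odd_windows(uint64_t scalar, uint32_t w)
{
    OddWindowForm out{ (scalar & 1) == 0, {} };
    uint64_t k = scalar + (out.skew ? 1 : 0); // working scalar is always odd
    const uint64_t window_mask = (uint64_t(1) << w) - 1;
    while (k != 0) {
        const uint64_t low = k & window_mask;    // odd, in [1, 2^w - 1]
        const uint64_t quotient = (k - low) >> w;
        if (quotient == 0 || (quotient & 1) == 1) {
            out.digits.push_back(int64_t(low));  // positive odd representative
            k = quotient;
        } else {
            out.digits.push_back(int64_t(low) - (int64_t(1) << w)); // negative odd representative
            k = quotient + 1;                    // borrow keeps the remaining scalar odd
        }
    }
    return out;
}

int main()
{
    const uint64_t scalar = 123456788; // even, so the skew flag gets set
    const uint32_t w = 4;
    const OddWindowForm form = to_odd_windows(scalar, w);
    // Reconstruct: sum_i digit_i * 2^{w*i} must equal scalar + skew.
    int64_t acc = 0;
    for (size_t i = form.digits.size(); i-- > 0;) {
        acc = (acc << w) + form.digits[i];
    }
    assert(uint64_t(acc) == scalar + (form.skew ? 1 : 0));
    return 0;
}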
- * - * @param point_schedule Pointer to the output array with all WNAFs - * @param input_skew_table Pointer to the output array with all skews - * @param round_counts The number of points in each round - * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF - * @param num_initial_points The number of points before the endomorphism split - **/ -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points) -{ - const size_t num_points = num_initial_points * 2; - constexpr size_t MAX_NUM_ROUNDS = 256; - constexpr size_t MAX_NUM_THREADS = 128; - const size_t num_rounds = get_num_rounds(num_points); - const size_t bits_per_bucket = get_optimal_bucket_width(num_initial_points); - const size_t wnaf_bits = bits_per_bucket + 1; -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t num_initial_points_per_thread = num_initial_points / num_threads; - const size_t num_points_per_thread = num_points / num_threads; - std::array, MAX_NUM_THREADS> thread_round_counts; - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - thread_round_counts[i][j] = 0; - } - } -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - fr T0; - uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; - bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; - uint64_t offset = i * num_points_per_thread; - - for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { - T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); - } - } - - for (size_t i = 0; i < num_rounds; ++i) { - round_counts[i] = 0; - } - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - round_counts[j] += thread_round_counts[i][j]; - } - } -} - -/** - * Sorts our wnaf entries in increasing bucket order (per round). - * We currently don't multi-thread the inner sorting algorithm, and just split our threads over the number of rounds. - * A multi-threaded sorting algorithm could be more efficient, but the total runtime of `organize_buckets` is <5% of - * pippenger's runtime, so not a priority. - **/ -void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t num_points) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_rounds; ++i) { - scalar_multiplication::process_buckets(&point_schedule[i * num_points], - num_points, - static_cast(get_optimal_bucket_width(num_points / 2)) + 1); - } -} - -/** - * adds a bunch of points together using affine addition formulae. - * Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to perform. 
- * Affine formula: - * - * \lambda = (y_2 - y_1) / (x_2 - x_1) - * x_3 = \lambda^2 - (x_2 + x_1) - * y_3 = \lambda*(x_1 - x_3) - y_1 - * - * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse, - * which is outrageously expensive. - * - * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0. - * - * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \} - * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products. - * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n] - * We then compute a single inverse: I = 1 / x_1x_2...x_n - * Finally, we can use our accumulated products, to quotient out individual inverses. - * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1) - * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we - * require. - * - * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse. - * Which means we can compute a point addition with SIX field multiplications in total. - * The traditional Jacobian-coordinate formula requires 11. - * - * There is a catch though - we need large sequences of independent point additions! - * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the sequence. - * - * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated - **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - - if (batch_inversion_accumulator == 0) { - throw_or_abort("attempted to invert zero in add_affine_points"); - } else { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. 
- // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { - continue; - } - if (points[i].x == points[i + 1].x) { - if (points[i].y == points[i + 1].y) { - // double - scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); - points[i + 1].x = points[i].y + points[i].y; // 2y - points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 - points[i + 1].y *= batch_inversion_accumulator; - batch_inversion_accumulator *= (points[i + 1].x); - continue; - } - points[i].self_set_infinity(); - points[i + 1].self_set_infinity(); - continue; - } - - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - if (!batch_inversion_accumulator.is_zero()) { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. - // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - if (points[i].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i + 1]; - continue; - } - if (points[i + 1].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i]; - continue; - } - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -/** - * evaluate a chain of pairwise additions. - * The additions are sequenced into base-2 segments - * i.e. 
pairs, pairs of pairs, pairs of pairs of pairs etc - * `max_bucket_bits` indicates the largest set of nested pairs in the array, - * which defines the iteration depth - **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) -{ - size_t end = state.num_points; - size_t start = 0; - for (size_t i = 0; i < max_bucket_bits; ++i) { - const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); - start = end - points_in_round; - if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } - } -} - -/** - * This is the entry point for our 'find a way of evaluating a giant multi-product using affine coordinates' algorithm - * By this point, we have already sorted our pippenger buckets. So we have the following situation: - * - * 1. We have a defined number of buckets points - * 2. We have a defined number of points, that need to be added into these bucket points - * 3. number of points >> number of buckets - * - * The algorithm begins by counting the number of points assigned to each bucket. - * For each bucket, we then take this count and split it into its base-2 components. - * e.g. if bucket[3] has 14 points, we split that into a sequence of (8, 4, 2) - * This base-2 splitting is useful, because we can take the bucket's associated points, and - * sort them into pairs, quads, octs etc. These mini-addition sequences are independent from one another, - * which means that we can use the affine trick to evaluate them. - * Once we're done, we have effectively reduced the number of points in the bucket to a logarithmic factor of the input. - * e.g. in the above example, once we've evaluated our pairwise addition of 8, 4 and 2 elements, - * we're left with 3 points. - * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. - * We repeat this process until every bucket only has one point assigned to it. - **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) -{ - - // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - // This method sorts our points into our required base-2 sequences. - // `max_bucket_bits` is log2(maximum bucket count). - // This sets the upper limit on how many iterations we need to perform in `evaluate_addition_chains`. - // e.g. if `max_bucket_bits == 3`, then we have at least one bucket with >= 8 points in it. - // which means we need to repeat our pairwise addition algorithm 3 times - // (e.g. add 4 pairs together to get 2 pairs, add those pairs together to get a single pair, which we add to reduce - // to our final point) - const size_t max_bucket_bits = construct_addition_chains(state, first_round); - - // if max_bucket_bits is 0, we're done! we can return - if (max_bucket_bits == 0) { - return state.point_pairs_1; - } - - // compute our required additions using the affine trick - evaluate_addition_chains(state, max_bucket_bits, handle_edge_cases); - - // this next step is a processing step, that computes a new point schedule for our reduced points. - // In the pippenger algorithm, we use a 64-bit uint to categorize each point. - // The high 32 bits describes the position of the point in a point array. 
- // The low 31 bits describes the bucket index that the point maps to - // The 32nd bit defines whether the point is actually a negation of our stored point. - - // We want to compute these 'point schedule' uints for our reduced points, so that we can recurse back into - // `reduce_buckets` - uint32_t start = 0; - const uint32_t end = static_cast(state.num_points); - // The output of `evaluate_addition_chains` has a bit of an odd structure, should probably refactor. - // Effectively, we used to have one big 1d array, and the act of computing these pair-wise point additions - // has chopped it up into sequences of smaller 1d arrays, with gaps in between - for (size_t i = 0; i < max_bucket_bits; ++i) { - const uint32_t points_in_round = - (static_cast(state.num_points) - state.bit_offsets[i + 1]) >> static_cast(i); - const uint32_t points_removed = points_in_round / 2; - - start = end - points_in_round; - const uint32_t modified_start = start + points_removed; - state.bit_offsets[i + 1] = modified_start; - } - - // iterate over each bucket. Identify how many remaining points there are, and compute their point scheduels - uint32_t new_num_points = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t& count = state.bucket_counts[i]; - uint32_t num_bits = numeric::get_msb(count) + 1; - uint32_t new_bucket_count = 0; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = state.bit_offsets[j]; - const bool has_entry = ((count >> j) & 1) == 1; - if (has_entry) { - uint64_t schedule = (static_cast(current_offset) << 32ULL) + i; - state.point_schedule[new_num_points++] = schedule; - ++new_bucket_count; - ++current_offset; - } - } - count = new_bucket_count; - } - - // modify `num_points` to reflect the new number of reduced points. - // also swap around the `point_pairs` pointer; what used to be our temporary array - // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; - state.num_points = new_num_points; - state.points = state.point_pairs_1; - state.point_pairs_1 = state.point_pairs_2; - state.point_pairs_2 = temp; - - // We could probably speed this up by unroling the recursion. - // But each extra call to `reduce_buckets` has an input size that is ~log(previous input size) - // so the extra run-time is meh - return reduce_buckets(state, false, handle_edge_cases); -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) -{ - // if this is the first call to `construct_addition_chains`, we need to count up our buckets - if (empty_bucket_counts) { - memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); - const uint32_t first_bucket = static_cast(state.point_schedule[0] & 0x7fffffffUL); - for (size_t i = 0; i < state.num_points; ++i) { - size_t bucket_index = static_cast(state.point_schedule[i] & 0x7fffffffUL); - ++state.bucket_counts[bucket_index - first_bucket]; - } - for (size_t i = 0; i < state.num_buckets; ++i) { - state.bucket_empty_status[i] = (state.bucket_counts[i] == 0); - } - } - - uint32_t max_count = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - max_count = state.bucket_counts[i] > max_count ? state.bucket_counts[i] : max_count; - } - - const uint32_t max_bucket_bits = numeric::get_msb(max_count); - - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - state.bit_offsets[i] = 0; - } - - // theoretically, can be unrolled using templated methods. - // However, explicitly unrolling the loop by using recursive template calls was slower! 
- // Inner loop is currently bounded by a constexpr variable, need to see what the compiler does with that... - count_bits(state.bucket_counts, &state.bit_offsets[0], state.num_buckets, max_bucket_bits); - - // we need to update `bit_offsets` to compute our point shuffle, - // but we need the original array later on, so make a copy. - std::array bit_offsets_copy = { 0 }; - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - bit_offsets_copy[i] = state.bit_offsets[i]; - } - - // this is where we take each bucket's associated points, and arrange them - // in a pairwise order, so that we can compute large sequences of additions using the affine trick - size_t schedule_it = 0; - uint32_t* bucket_count_it = state.bucket_counts; - - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t count = *bucket_count_it; - ++bucket_count_it; - uint32_t num_bits = numeric::get_msb(count) + 1; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = bit_offsets_copy[j]; - const size_t k_end = count & (1UL << j); - // This section is a bottleneck - to populate our point array, we need - // to read from memory locations that are effectively uniformly randomly distributed! - // (assuming our scalar multipliers are uniformly random...) - // In the absence of a more elegant solution, we use ugly macro hacks to try and - // unroll loops, and prefetch memory a few cycles before we need it - switch (k_end) { - case 64: { - [[fallthrough]]; - } - case 32: { - [[fallthrough]]; - } - case 16: { - for (size_t k = 0; k < (k_end >> 4); ++k) { - BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK; - } - break; - } - case 8: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 8] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 9] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 10] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 11] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 12] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 13] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 14] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 15] >> 32ULL)); - - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - const uint64_t schedule_e = state.point_schedule[schedule_it + 4]; - const uint64_t schedule_f = state.point_schedule[schedule_it + 5]; - const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; - const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - 
state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); - - current_offset += 8; - schedule_it += 8; - break; - } - case 4: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - current_offset += 4; - schedule_it += 4; - break; - } - case 2: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - current_offset += 2; - schedule_it += 2; - break; - } - case 1: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - ++current_offset; - ++schedule_it; - break; - } - case 0: { - break; - } - default: { - for (size_t k = 0; k < k_end; ++k) { - uint64_t schedule = state.point_schedule[schedule_it]; - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 1] >> 32ULL)); - - const uint64_t 
predicate = (schedule >> 31UL) & 1UL; - - g1::conditional_negate_affine( - state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); - ++current_offset; - ++schedule_it; - } - } - } - } - } - return max_bucket_bits; -} - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); - -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t j = 0; j < num_threads; ++j) { - thread_accumulators[j].self_set_infinity(); - - for (size_t i = 0; i < num_rounds; ++i) { - - const uint64_t num_round_points = state.round_counts[i]; - - g1::element accumulator; - accumulator.self_set_infinity(); - - if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { - } else { - - const uint64_t num_round_points_per_thread = num_round_points / num_threads; - const uint64_t leftovers = - (j == num_threads - 1) ? (num_round_points) - (num_round_points_per_thread * num_threads) : 0; - - uint64_t* thread_point_schedule = - &state.point_schedule[(i * num_points) + j * num_round_points_per_thread]; - const size_t first_bucket = thread_point_schedule[0] & 0x7fffffffU; - const size_t last_bucket = - thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; - const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); - product_state.num_points = static_cast(num_round_points_per_thread + leftovers); - product_state.points = points; - product_state.point_schedule = thread_point_schedule; - product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; - running_sum.self_set_infinity(); - - // one nice side-effect of the affine trick, is that half of the bucket concatenation - // algorithm can use mixed addition formulae, instead of full addition formulae - size_t output_it = product_state.num_points - 1; - for (size_t k = num_thread_buckets - 1; k > 0; --k) { - if (__builtin_expect(!product_state.bucket_empty_status[k], 1)) { - running_sum += (output_buckets[output_it]); - --output_it; - } - accumulator += running_sum; - } - running_sum += output_buckets[0]; - accumulator.self_dbl(); - accumulator += running_sum; - - // we now need to scale up 'running sum' up to the value of the first bucket. - // e.g. 
if first bucket is 0, no scaling - // if first bucket is 1, we need to add (2 * running_sum) - if (first_bucket > 0) { - uint32_t multiplier = static_cast(first_bucket << 1UL); - size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; - bool init = false; - while (shift != static_cast(-1)) { - if (init) { - rolling_accumulator.self_dbl(); - if (((multiplier >> shift) & 1)) { - rolling_accumulator += running_sum; - } - } else { - rolling_accumulator += running_sum; - } - init = true; - shift -= 1; - } - accumulator += rolling_accumulator; - } - } - - if (i == (num_rounds - 1)) { - const size_t num_points_per_thread = num_points / num_threads; - bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; - for (size_t k = 0; k < num_points_per_thread; ++k) { - if (skew_table[k]) { - addition_temporary = -point_table[k]; - accumulator += addition_temporary; - } - } - } - - if (i > 0) { - for (size_t k = 0; k < bits_per_bucket + 1; ++k) { - thread_accumulators[j].self_dbl(); - } - } - thread_accumulators[j] += accumulator; - } - } - - g1::element result; - result.self_set_infinity(); - for (size_t i = 0; i < num_threads; ++i) { - result += thread_accumulators[i]; - } - return result; -} - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); - return result; -} - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // our windowed non-adjacent form algorithm requires that each thread can work on at least 8 points. - // If we fall below this threshold, fall back to the traditional scalar multiplication algorithm. - // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger -#ifndef NO_MULTITHREADING - const size_t threshold = std::max(max_threads::compute_num_threads() * 8, 8UL); -#else - const size_t threshold = 8UL; -#endif - - if (num_initial_points == 0) { - g1::element out = g1::one; - out.self_set_infinity(); - return out; - } - - if (num_initial_points <= threshold) { - std::vector<g1::element> exponentiation_results(num_initial_points); - // might as well multithread this... - // Possible optimization: use group::batch_mul_with_endomorphism here.
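        // For example, assuming 8 threads, the threshold above works out to std::max(8 * 8, 8) = 64, so
        // multi-exponentiations of 64 or fewer points take this naive per-point loop instead of the full
        // Pippenger machinery.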
-#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; - } - - for (size_t i = num_initial_points - 1; i > 0; --i) { - exponentiation_results[i - 1] += exponentiation_results[i]; - } - return exponentiation_results[0]; - } - - const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); - const size_t num_slice_points = static_cast(1ULL << slice_bits); - - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); - - if (num_slice_points != num_initial_points) { - const uint64_t leftover_points = num_initial_points - num_slice_points; - return result + pippenger(scalars + num_slice_points, - points + static_cast(num_slice_points * 2), - static_cast(leftover_points), - state, - handle_edge_cases); - } else { - return result; - } -} - -/** - * It's pippenger! But this one has go-faster stripes and a predilection for questionable life choices. - * We use affine-addition formula in this method, which paradoxically is ~45% faster than the mixed addition formulae. - * See `scalar_multiplication.cpp` for a more detailed description. - * - * It's...unsafe, because we assume that the incomplete addition formula exceptions are not triggered. - * We don't bother to check for this to avoid conditional branches in a critical section of our code. - * This is fine for situations where your bases are linearly independent (i.e. KZG10 polynomial commitments), - * because triggering the incomplete addition exceptions is about as hard as solving the discrete log problem. - * - * This is ok for the prover, but GIANT RED CLAXON WARNINGS FOR THE VERIFIER - * Don't use this in a verification algorithm! That would be a really bad idea. - * Unless you're a malicious adversary, then it would be a great idea! - * - **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - return pippenger(scalars, points, num_initial_points, state, false); -} -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - std::vector<g1::affine_element> G_mod(num_initial_points * 2); - barretenberg::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); - return pippenger(scalars, &G_mod[0], num_initial_points, state, false); -} -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp deleted file mode 100644 index 36613a47a9..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#pragma once - -#include "../fr.hpp" -#include "../g1.hpp" -#include "./runtime_states.hpp" -#include -#include - -namespace barretenberg { -namespace scalar_multiplication { - -constexpr size_t get_num_buckets(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return 1UL << bits_per_bucket; -} - -/** - * pointers that describe how to add points into buckets, for the pippenger algorithm.
- * `wnaf_table` is an unrolled two-dimensional array, with each inner array being of size `n`, - * where `n` is the number of points being multiplied. The second dimension size is defined by - * the number of pippenger rounds (fixed for a given `n`, see `get_num_rounds`) - * - * An entry of `wnaf_table` contains the following three pieces of information: - * 1: the point index that we're working on. This is stored in the high 32 bits - * 2: the bucket index that we're adding the point into. This is stored in the low 31 bits - * 3: the sign of the point we're adding (i.e. do we actually need to subtract). This is stored in the 32nd bit. - * - * We pack this information into a 64 bit unsigned integer, so that we can more efficiently sort our wnaf entries. - * For a given round, we want to sort our wnaf entries in increasing bucket index order. - * - * This is so that we can efficiently use multiple threads to execute the pippenger algorithm. - * For a given round, a given point's bucket index will be uniformly randomly distributed, - * assuming the inputs are from a zero-knowledge proof. This is because the scalar multiplier will be uniformly randomly - *distributed, and the bucket indices are derived from the scalar multiplier. - * - * This means that, if we were to iterate over all of our points in order, and add each point into its associated - *bucket, we would be accessing all of our buckets in a completely random pattern. - * - * Aside from memory latency problems this incurs, this makes the naive algorithm unsuitable for multithreading - we - *cannot assign a thread a tranche of points, because each thread will be adding points into the same set of buckets, - *triggering race conditions. We do not want to manage the overhead of thread locks for each bucket; the process of - *adding a point into a bucket takes, on average, only 400 CPU cycles, so the slowdown of managing mutex locks would add - *considerable overhead. - * - * The solution is to sort the buckets. If the buckets are sorted, we can assign a tranche of buckets to individual - *threads, safe in the knowledge that there will be no race conditions, with one condition. A thread's starting bucket - *may be equal to the previous thread's end bucket, so we need to ensure that each thread works on a local array of - *buckets. This adds little overhead (for 2^20 points, we have 32,768 buckets. With 8 threads, the amount of bucket - *overlap is ~16 buckets, so we could incur 16 extra 'additions' in pippenger's bucket concatenation phase, but this is - *an insignificant contribution). - * - * The alternative approach (the one we used to use) is to slice up all of the points being multiplied amongst all - *available threads, and run the complete pippenger algorithm for each thread. This is suboptimal, because the - *complexity of pippenger is O(n / logn) point additions, and a sequence of smaller pippenger calls will have a smaller - *`n`. - * - * This is the motivation for multi-threading the actual Pippenger algorithm. In addition, the above approach performs - *extremely poorly for GPUs, where the number of threads can be as high as 2^10 (for a multi-scalar-multiplication of - *2^20 points, this doubles the number of pippenger rounds per thread) - * - * To give concrete numbers, the difference between calling pippenger on 2^20 points, and calling pippenger 8 times on - *2^17 points, is 5-10%. Which means that, for 8 threads, we need to ensure that our sorting algorithm adds less than 5% - *to the total runtime of pippenger. 
Given a single cache miss per point would increase the run-time by 25%, this is not - *much room to work with! - * - * However, a radix sort, combined with the fact that the total number of buckets is quite small (2^16 at most), seems - *to be fast enough. Benchmarks indicate (i7-8650U, 8 threads) that, for 2^20 points, the total runtime is <1200ms and - *of that, the radix sort consumes 58ms (4.8%) - * - * One advantage of sorting by bucket order vs point order, is that a 'bucket' is 96 bytes large (sizeof(g1::element), - *buckets have z-coordinates). Points, on the other hand, are 64 bytes large (affine points, no z-coordinate). This - *makes fetching random point locations in memory more efficient than fetching random bucket locations, as each point - *occupies a single cache line. Using __builtin_prefetch to recover the point just before it's needed, seems to improve - *the runtime of pippenger by 10-20%. - * - * Finally, `skew_table` tracks whether a scalar multplier is even or odd - * (if it's even, we need to subtract the point from the total result, - * because our windowed non-adjacent form values can only be odd) - * - **/ - -struct multiplication_thread_state { - g1::element* buckets; - const uint64_t* point_schedule; -}; - -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points); - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); - -void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); - -inline void count_bits(uint32_t* bucket_counts, - uint32_t* bit_offsets, - const uint32_t num_buckets, - const size_t num_bits) -{ - for (size_t i = 0; i < num_buckets; ++i) { - const uint32_t count = bucket_counts[i]; - for (uint32_t j = 0; j < num_bits; ++j) { - bit_offsets[j + 1] += (count & (1U << j)); - } - } - bit_offsets[0] = 0; - for (size_t i = 2; i < num_bits + 1; ++i) { - bit_offsets[i] += bit_offsets[i - 1]; - } -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); - -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); - -void evaluate_addition_chains(affine_product_runtime_state& state, - const size_t max_bucket_bits, - bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git 
a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp deleted file mode 100644 index bd0875ba96..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp +++ /dev/null @@ -1,937 +0,0 @@ -#include "pippenger.hpp" -#include "scalar_multiplication.hpp" -#include -#include "barretenberg/common/test.hpp" -#include "barretenberg/srs/io.hpp" -#include - -#include "barretenberg/numeric/random/engine.hpp" - -#include "barretenberg/common/mem.hpp" - -#define BARRETENBERG_SRS_PATH "../srs_db/ignition" - -using namespace barretenberg; -using namespace barretenberg::scalar_multiplication; - -namespace { -auto& engine = numeric::random::get_debug_engine(); -} - -TEST(scalar_multiplication, reduce_buckets_simple) -{ - constexpr size_t num_points = 128; - g2::affine_element g2_x; - io::read_transcript_g2(g2_x, BARRETENBERG_SRS_PATH); - auto pippenger = Pippenger(BARRETENBERG_SRS_PATH, num_points / 2); - auto monomials = pippenger.get_point_table(); - - std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); - std::array bucket_empty_status; - // 16 buckets, each bucket has one point - std::array transcript; - std::array transcript_points; - transcript_points[0] = 0x0; - transcript_points[1] = 0x2; - transcript_points[2] = 0x4; - transcript_points[3] = 0x6; - transcript_points[4] = 0xb; - transcript_points[5] = 0xc; - transcript_points[6] = 0xe; - transcript_points[7] = 0x11; - transcript_points[8] = 0x13; - transcript_points[9] = 0x14; - transcript_points[10] = 0x15; - transcript_points[11] = 0x16; - transcript_points[12] = 0x17; - transcript_points[13] = 0x18; - transcript_points[14] = 0x20; - transcript_points[15] = 0x21; - transcript_points[16] = 0x22; - transcript_points[17] = 0x27; - transcript_points[18] = 0x29; - transcript_points[19] = 0x2b; - transcript_points[20] = 0x2c; - transcript_points[21] = 0x2d; - transcript_points[22] = 0x2e; - transcript_points[23] = 0x36; - transcript_points[24] = 0x37; - transcript_points[25] = 0x38; - transcript_points[26] = 0x3e; - transcript_points[27] = 0x3f; - transcript_points[28] = 0x4e; - transcript_points[29] = 0x4f; - transcript_points[30] = 0x50; - transcript_points[31] = 0x51; - transcript_points[32] = 0x41; - transcript_points[33] = 0x52; - transcript_points[34] = 0x53; - transcript_points[35] = 0x54; - transcript_points[36] = 0x43; - transcript_points[37] = 0x57; - transcript_points[38] = 0x46; - transcript_points[39] = 0x58; - transcript_points[40] = 0x5b; - transcript_points[41] = 0x5e; - transcript_points[42] = 0x42; - transcript_points[43] = 0x47; - transcript_points[44] = 0x4b; - transcript_points[45] = 0x4d; - transcript_points[46] = 0x6b; - transcript_points[47] = 0x65; - transcript_points[48] = 0x6d; - transcript_points[49] = 0x67; - transcript_points[50] = 0x6f; - transcript_points[51] = 0x68; - transcript_points[52] = 0x69; - transcript_points[53] = 0x6a; - transcript_points[54] = 0x71; - transcript_points[55] = 0x72; - transcript_points[56] = 0x73; - transcript_points[57] = 0x74; - transcript_points[58] = 0x75; - transcript_points[59] = 0x66; - transcript_points[60] = 0x79; - transcript_points[62] = 0x7c; - transcript_points[61] = 0x7e; - transcript_points[63] = 0x7f; - transcript_points[64] = 0x1; - transcript_points[65] = 0x3; - transcript_points[66] = 0x5; - transcript_points[67] = 0x7; - transcript_points[68] = 0x8; - 
transcript_points[69] = 0x9; - transcript_points[70] = 0xa; - transcript_points[71] = 0xd; - transcript_points[72] = 0xf; - transcript_points[73] = 0x10; - transcript_points[74] = 0x12; - transcript_points[75] = 0x19; - transcript_points[76] = 0x1a; - transcript_points[77] = 0x1b; - transcript_points[78] = 0x1c; - transcript_points[79] = 0x1d; - transcript_points[80] = 0x1e; - transcript_points[81] = 0x1f; - transcript_points[82] = 0x23; - transcript_points[83] = 0x24; - transcript_points[84] = 0x25; - transcript_points[85] = 0x26; - transcript_points[86] = 0x28; - transcript_points[87] = 0x2a; - transcript_points[88] = 0x2f; - transcript_points[89] = 0x30; - transcript_points[90] = 0x31; - transcript_points[91] = 0x32; - transcript_points[92] = 0x33; - transcript_points[93] = 0x34; - transcript_points[94] = 0x35; - transcript_points[95] = 0x39; - transcript_points[96] = 0x3a; - transcript_points[97] = 0x3b; - transcript_points[98] = 0x3c; - transcript_points[99] = 0x3d; - transcript_points[100] = 0x48; - transcript_points[101] = 0x49; - transcript_points[102] = 0x55; - transcript_points[103] = 0x56; - transcript_points[104] = 0x4a; - transcript_points[105] = 0x44; - transcript_points[106] = 0x45; - transcript_points[107] = 0x40; - transcript_points[108] = 0x59; - transcript_points[109] = 0x5a; - transcript_points[110] = 0x5c; - transcript_points[111] = 0x5d; - transcript_points[112] = 0x5f; - transcript_points[113] = 0x60; - transcript_points[114] = 0x61; - transcript_points[115] = 0x62; - transcript_points[116] = 0x63; - transcript_points[117] = 0x4c; - transcript_points[118] = 0x6c; - transcript_points[119] = 0x6e; - transcript_points[120] = 0x64; - transcript_points[121] = 0x70; - transcript_points[122] = 0x77; - transcript_points[123] = 0x78; - transcript_points[124] = 0x76; - transcript_points[125] = 0x7a; - transcript_points[126] = 0x7b; - transcript_points[127] = 0x7d; - - for (size_t i = 0; i < 64; ++i) { - transcript[i] = 0; - transcript[i + 64] = 1; - } - for (size_t i = 0; i < num_points; ++i) { - point_schedule[i] = (static_cast(transcript_points[i]) << 32ULL) + transcript[i]; - } - std::array expected; - for (size_t i = 0; i < num_points; ++i) { - expected[i].self_set_infinity(); - } - - for (size_t i = 0; i < num_points; ++i) { - size_t schedule = transcript[i] & 0x7fffffffU; - { - expected[schedule] += monomials[static_cast(transcript_points[i])]; - } - } - - std::array point_pairs; - std::array output_buckets; - std::array scratch_space; - std::array bucket_counts; - std::array bit_offsets = { 0 }; - - scalar_multiplication::affine_product_runtime_state product_state{ - &monomials[0], &point_pairs[0], &output_buckets[0], - &scratch_space[0], &bucket_counts[0], &bit_offsets[0], - &point_schedule[0], num_points, 2, - &bucket_empty_status[0] - }; - - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); - - for (size_t i = 0; i < product_state.num_buckets; ++i) { - expected[i] = expected[i].normalize(); - EXPECT_EQ((output[i].x == expected[i].x), true); - EXPECT_EQ((output[i].y == expected[i].y), true); - } -} - -TEST(scalar_multiplication, reduce_buckets) -{ - constexpr size_t num_initial_points = 1 << 12; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* 
point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::element* expected_buckets = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points * 2))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points * 2))); - - memset((void*)scratch_points, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)expected_buckets, 0x00, (num_points * 2) * sizeof(g1::element)); - memset((void*)bucket_empty_status, 0x00, (num_points * 2) * sizeof(bool)); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - for (size_t i = 0; i < num_initial_points; ++i) { - scalars[i] = fr::random_element(); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * 100 * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - - uint64_t* point_schedule_copy = static_cast(aligned_alloc(64, sizeof(uint64_t) * num_points * 2)); - for (size_t i = 0; i < num_points; ++i) { - state.point_schedule[i + num_points] = state.point_schedule[i + num_points] & 0xffffffff7fffffffUL; - // printf("state.point_schedule[%lu] = %lx \n", i, state.point_schedule[i]); - point_schedule_copy[i] = state.point_schedule[i + num_points]; - } - const size_t first_bucket = point_schedule_copy[0] & 0x7fffffffULL; - const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - &state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - for (size_t i = 0; i < num_points; ++i) { - expected_buckets[i].self_set_infinity(); - 
} - for (size_t i = 0; i < num_points; ++i) { - uint64_t schedule = point_schedule_copy[i]; - uint64_t bucket_index = schedule & 0x7fffffffU; - uint64_t point_index = schedule >> 32ULL; - uint64_t predicate = (schedule >> 31ULL) & 1ULL; - // printf("expected bucket index = %lu \n", bucket_index - first_bucket); - g1::element& bucket = expected_buckets[bucket_index - first_bucket]; - g1::affine_element& point = monomials[point_index]; - bucket.self_mixed_add_or_sub(point, predicate); - } - - size_t it = 0; - - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); - - printf("num buckets = %zu \n", num_buckets); - for (size_t i = 0; i < num_buckets; ++i) { - if (!bucket_empty_status[i]) { - g1::element expected = expected_buckets[i].normalize(); - EXPECT_EQ((expected.x == result_buckets[it].x), true); - EXPECT_EQ((expected.y == result_buckets[it].y), true); - ++it; - } else { - printf("recorded empty bucket???\n"); - } - } - aligned_free(bucket_empty_status); - aligned_free(expected_buckets); - aligned_free(point_schedule_copy); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -// This test intermittenly fails. -TEST(scalar_multiplication, DISABLED_reduce_buckets_basic) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points))); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_points, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << 
diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(scalar_multiplication, add_affine_points) -{ - constexpr size_t num_points = 20; - g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - fq* scratch_space = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - fq* lambda = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - - g1::element* points_copy = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points))); - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - points_copy[i].x = points[i].x; - points_copy[i].y = points[i].y; - points_copy[i].z = fq::one(); - } - - size_t count = num_points - 1; - for (size_t i = num_points - 2; i < num_points; i -= 2) { - points_copy[count--] = points_copy[i] + points_copy[i + 1]; - points_copy[count + 1] = points_copy[count + 1].normalize(); - } - - scalar_multiplication::add_affine_points(points, num_points, scratch_space); - for (size_t i = num_points - 1; i > num_points - 1 - (num_points / 2); --i) { - EXPECT_EQ((points[i].x == points_copy[i].x), true); - EXPECT_EQ((points[i].y == points_copy[i].y), true); - } - aligned_free(lambda); - aligned_free(points); - aligned_free(points_copy); - aligned_free(scratch_space); -} - -TEST(scalar_multiplication, construct_addition_chains) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = 
std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - bool* bucket_empty_status = static_cast(aligned_alloc(64, num_points * sizeof(bool))); - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - static_cast(state.round_counts[0]), - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - info("construct addition chains: ", diff.count(), "ms"); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(scalar_multiplication, endomorphism_split) -{ - fr scalar = fr::random_element(); - - g1::element expected = g1::one * scalar; - - // we want to test that we can split a scalar into two half-length components, using the same location in memory. - fr* k1_t = &scalar; - fr* k2_t = (fr*)&scalar.data[2]; - - fr::split_into_endomorphism_scalars(scalar, *k1_t, *k2_t); - // The compiler really doesn't like what we're doing here, - // and disabling the array-bounds error project-wide seems unsafe. - // The large macro blocks are here to warn that we should be careful when - // aliasing the arguments to split_into_endomorphism_scalars -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - fr k1{ (*k1_t).data[0], (*k1_t).data[1], 0, 0 }; - fr k2{ (*k2_t).data[0], (*k2_t).data[1], 0, 0 }; -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - g1::element result; - g1::element t1 = g1::affine_one * k1; - g1::affine_element generator = g1::affine_one; - fq beta = fq::cube_root_of_unity(); - generator.x = generator.x * beta; - generator.y = -generator.y; - g1::element t2 = generator * k2; - result = t1 + t2; - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, radix_sort) -{ - // check that our radix sort correctly sorts! 
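    // Concretely, "correctly sorts" means that within each round the schedule entries produced by
    // organize_buckets are non-decreasing in their low 31 bits (the bucket index). A sketch of that property,
    // with illustrative names, would be:
    //
    //   auto by_bucket = [](uint64_t a, uint64_t b) { return (a & 0x7fffffffULL) < (b & 0x7fffffffULL); };
    //   bool sorted = std::is_sorted(round_schedule, round_schedule + num_round_entries, by_bucket);
    //
    // The loop below checks the same pairwise comparison directly, and additionally that every sorted entry
    // appears somewhere in the unsorted input.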
- constexpr size_t target_degree = 1 << 8; - constexpr size_t num_rounds = scalar_multiplication::get_num_rounds(target_degree * 2); - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < target_degree; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); - - uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); - memcpy((void*)wnaf_copy, (void*)state.point_schedule, sizeof(uint64_t) * target_degree * 2 * num_rounds); - - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, target_degree * 2); - for (size_t i = 0; i < num_rounds; ++i) { - uint64_t* unsorted_wnaf = &wnaf_copy[i * target_degree * 2]; - uint64_t* sorted_wnaf = &state.point_schedule[i * target_degree * 2]; - - const auto find_entry = [unsorted_wnaf, num_entries = target_degree * 2](auto x) { - for (size_t k = 0; k < num_entries; ++k) { - if (unsorted_wnaf[k] == x) { - return true; - } - } - return false; - }; - for (size_t j = 0; j < target_degree * 2; ++j) { - EXPECT_EQ(find_entry(sorted_wnaf[j]), true); - if (j > 0) { - EXPECT_EQ((sorted_wnaf[j] & 0x7fffffffU) >= (sorted_wnaf[j - 1] & 0x7fffffffU), true); - } - } - } - - free(scalars); - free(wnaf_copy); -} - -HEAVY_TEST(scalar_multiplication, oversized_inputs) -{ - // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. - // Check that this is done correctly - size_t transcript_degree = 1 << 20; - size_t target_degree = 1200000; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, transcript_degree, BARRETENBERG_SRS_PATH); - - memcpy((void*)(monomials + (2 * transcript_degree)), - (void*)monomials, - ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - fr accumulator = source_scalar; - for (size_t i = 0; i < target_degree; ++i) { - accumulator *= source_scalar; - fr::__copy(accumulator, scalars[i]); - } - scalar_multiplication::pippenger_runtime_state state(target_degree); - - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); - first = first.normalize(); - - for (size_t i = 0; i < target_degree; ++i) { - scalars[i].self_neg(); - } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); - - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); - second = second.normalize(); - - EXPECT_EQ((first.z == second.z), true); - EXPECT_EQ((first.z == fq::one()), true); - EXPECT_EQ((first.x == second.x), true); - EXPECT_EQ((first.y == -second.y), true); - - aligned_free(monomials); - aligned_free(scalars); -} - -TEST(scalar_multiplication, undersized_inputs) -{ - // we fall back to traditional scalar multiplication algorithm for small input sizes. 
- // Check this is done correctly - size_t num_points = 17; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_edge_case_dbl) -{ - constexpr size_t num_points = 128; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - g1::affine_element point = g1::affine_element(g1::element::random_element()); - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = point; - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - if (!expected.is_point_at_infinity()) { - expected = expected.normalize(); - } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = 
engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_unsafe) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_unsafe_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = 
engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_one) -{ - size_t num_points = 1; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * 1); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_zero_points) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} - -TEST(scalar_multiplication, pippenger_mul_by_zero) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalars[0] = fr::zero(); - points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); - - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp index c1a9bc0457..cd575a16d6 100644 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp +++ b/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp @@ -39,7 +39,8 @@ class Grumpkin { public: using ScalarField = barretenberg::fq; using BaseField = barretenberg::fr; - using ProjectiveElement = typename grumpkin::g1::element; - using AffineElement = typename 
grumpkin::g1::affine_element; + using Group = typename grumpkin::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp deleted file mode 100644 index b490f4973d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// #include "scalar_multiplication.hpp" -// #include "pippenger.hpp" -// #include "barretenberg/common/mem.hpp" - -// using namespace barretenberg; - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size) -// { -// auto ptr = aligned_alloc(64, size); -// return ptr; -// } - -// WASM_EXPORT void bbfree(void* ptr) -// { -// aligned_free(ptr); -// } - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) -// { -// auto ptr = new scalar_multiplication::Pippenger(points, num_points); -// return ptr; -// } - -// WASM_EXPORT void delete_pippenger(void* pippenger) -// { -// delete reinterpret_cast(pippenger); -// } - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr) -// { -// scalar_multiplication::pippenger_runtime_state state(range); -// auto pippenger = reinterpret_cast(pippenger_ptr); -// auto scalars = reinterpret_cast(scalars_ptr); -// auto result = reinterpret_cast(result_ptr); -// *result = pippenger->pippenger_unsafe(scalars, from, range); -// } - -// WASM_EXPORT void g1_sum(void* points_ptr, const size_t num_points, void* result_ptr) -// { -// auto points = reinterpret_cast(points_ptr); -// auto result = reinterpret_cast(result_ptr); -// result->self_set_infinity(); -// *result = std::accumulate(points, points + num_points, *result); -// } -// } diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp deleted file mode 100644 index 60af544690..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #include -// #include "../g1.hpp" - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size); - -// WASM_EXPORT void bbfree(void* ptr); - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points); - -// WASM_EXPORT void delete_pippenger(void* pippenger); - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr); WASM_EXPORT void g1_sum(void* points_ptr, size_t num_points, void* result_ptr); -// } diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp deleted file mode 100644 index 08c6b62960..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "pippenger.hpp" -#include "barretenberg/srs/io.hpp" -namespace grumpkin { -namespace scalar_multiplication { - -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) - : monomials_(points) - , num_points_(num_points) -{ - grumpkin::io::byteswap(&monomials_[0], num_points * 64); - 
scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(uint8_t const* points, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - grumpkin::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(std::string const& path, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - grumpkin::io::read_transcript_g1(monomials_, num_points, path); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) -{ - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); -} - -Pippenger::~Pippenger() -{ - free(monomials_); -} - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp deleted file mode 100644 index a6c55f1bc4..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once -#include "./scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace grumpkin { -namespace scalar_multiplication { - -inline size_t point_table_size(size_t num_points) -{ -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - - return 2 * num_points + prefetch_overflow; -} - -template inline size_t point_table_buf_size(size_t num_points) -{ - return sizeof(T) * point_table_size(num_points); -} - -template inline T* point_table_alloc(size_t num_points) -{ - return (T*)aligned_alloc(64, point_table_buf_size(num_points)); -} - -class Pippenger { - public: - /** - * Expects points to be buffer of size as per point_table_size(). - * It expects the crs to start at points[1], and it fills in affine_one at points[0]. - * The crs undergoes a byteswap, and then the point table is generated. 
- */ - Pippenger(g1::affine_element* points, size_t num_points); - - Pippenger(uint8_t const* points, size_t num_points); - - Pippenger(std::string const& path, size_t num_points); - - ~Pippenger(); - - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); - - g1::affine_element* get_point_table() const { return monomials_; } - - size_t get_num_points() const { return num_points_; } - - private: - g1::affine_element* monomials_; - size_t num_points_; -}; - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp deleted file mode 100644 index f56bdaa936..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "process_buckets.hpp" - -#include - -namespace grumpkin { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept -{ - constexpr size_t num_bits = 8; - constexpr size_t num_buckets = 1UL << num_bits; - constexpr uint32_t mask = static_cast(num_buckets) - 1U; - std::array bucket_counts{}; - - for (size_t i = 0; i < num_entries; ++i) { - bucket_counts[(keys[i] >> shift) & mask]++; - } - - std::array offsets; - std::array offsets_copy; - offsets[0] = 0; - - for (size_t i = 0; i < num_buckets - 1; ++i) { - bucket_counts[i + 1] += bucket_counts[i]; - } - for (size_t i = 1; i < num_buckets + 1; ++i) { - offsets[i] = bucket_counts[i - 1]; - } - for (size_t i = 0; i < num_buckets + 1; ++i) { - offsets_copy[i] = offsets[i]; - } - uint64_t* start = &keys[0]; - - for (size_t i = 0; i < num_buckets; ++i) { - uint64_t* bucket_start = &keys[offsets[i]]; - const uint64_t* bucket_end = &keys[offsets_copy[i + 1]]; - while (bucket_start != bucket_end) { - for (uint64_t* it = bucket_start; it < bucket_end; ++it) { - const size_t value = (*it >> shift) & mask; - const uint64_t offset = offsets[value]++; - std::iter_swap(it, start + offset); - } - bucket_start = &keys[offsets[i]]; - } - } - if (shift > 0) { - for (size_t i = 0; i < num_buckets; ++i) { - if (offsets_copy[i + 1] - offsets_copy[i] > 1) { - radix_sort(&keys[offsets_copy[i]], offsets_copy[i + 1] - offsets_copy[i], shift - 8); - } - } - } -} - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept -{ - const uint32_t bits_per_round = 8; - const uint32_t base = num_bits & 7; - const uint32_t total_bits = (base == 0) ? 
num_bits : num_bits - base + 8; - const uint32_t shift = total_bits - bits_per_round; - - radix_sort(wnaf_entries, num_entries, shift); -} -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp deleted file mode 100644 index d4ef31da06..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include - -namespace grumpkin { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp deleted file mode 100644 index 36d894eafa..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "runtime_states.hpp" - -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace grumpkin { -namespace scalar_multiplication { - -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) -{ - constexpr size_t MAX_NUM_ROUNDS = 256; - num_points = num_initial_points * 2; - const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - const size_t num_rounds = - static_cast(grumpkin::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); - point_schedule = (uint64_t*)(aligned_alloc( - 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); - skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); - bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); - round_counts = (uint64_t*)(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))); - - const size_t points_per_thread = static_cast(num_points) / num_threads; -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - const size_t thread_offset = i * points_per_thread; - 
memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); - for (size_t j = 0; j < num_rounds; ++j) { - const size_t round_offset = (j * static_cast(num_points)); - memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); - } - memset((void*)(skew_table + thread_offset), 0, points_per_thread * sizeof(bool)); - } - - memset((void*)bucket_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bit_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bucket_empty_status, 0, num_threads * num_buckets * sizeof(bool)); - memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); -} - -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) -{ - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; -} - -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } - - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; - return *this; -} - -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) -{ - const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - - scalar_multiplication::affine_product_runtime_state product_state; - - product_state.point_pairs_1 = point_pairs_1 + 
(thread_index * points_per_thread) + (thread_index * 16); - product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); - product_state.scratch_space = scratch_space + (thread_index * (points_per_thread / 2)); - product_state.bucket_counts = bucket_counts + (thread_index * (num_buckets)); - product_state.bit_offsets = bit_counts + (thread_index * (num_buckets)); - product_state.bucket_empty_status = bucket_empty_status + (thread_index * (num_buckets)); - return product_state; -} - -pippenger_runtime_state::~pippenger_runtime_state() -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } -} -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp deleted file mode 100644 index 050c955c8c..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include "../grumpkin.hpp" -#include "barretenberg/ecc/groups/wnaf.hpp" - -namespace grumpkin { -// simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. -// This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
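// A minimal, self-contained sketch of the allocation pattern this runtime state exists for:
// one aligned allocation made (and written) once up front, then handed out as disjoint
// per-thread windows, so the hot loop never takes page faults or needs synchronization.
// `scratch_pool` / `thread_view` are hypothetical names used only for illustration, and plain
// doubles stand in for the field/point types; this is not part of the library's API.
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct scratch_pool {
    double* data;
    size_t per_thread;

    scratch_pool(size_t num_threads, size_t elements_per_thread)
        : per_thread(elements_per_thread)
    {
        size_t bytes = num_threads * per_thread * sizeof(double);
        bytes = (bytes + 63) & ~size_t(63); // std::aligned_alloc requires a multiple of the alignment
        data = static_cast<double*>(std::aligned_alloc(64, bytes));
        std::memset(data, 0, bytes); // touch every page once, up front, rather than inside the hot loop
    }

    // Each thread works on its own disjoint window of the shared buffer.
    double* thread_view(size_t thread_index) { return data + thread_index * per_thread; }

    ~scratch_pool() { std::free(data); }
};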
-namespace scalar_multiplication { -constexpr size_t get_optimal_bucket_width(const size_t num_points) -{ - if (num_points >= 14617149) { - return 21; - } - if (num_points >= 1139094) { - return 18; - } - // if (num_points >= 100000) - if (num_points >= 155975) { - return 15; - } - if (num_points >= 144834) - // if (num_points >= 100000) - { - return 14; - } - if (num_points >= 25067) { - return 12; - } - if (num_points >= 13926) { - return 11; - } - if (num_points >= 7659) { - return 10; - } - if (num_points >= 2436) { - return 9; - } - if (num_points >= 376) { - return 7; - } - if (num_points >= 231) { - return 6; - } - if (num_points >= 97) { - return 5; - } - if (num_points >= 35) { - return 4; - } - if (num_points >= 10) { - return 3; - } - if (num_points >= 2) { - return 2; - } - return 1; -} - -constexpr size_t get_num_rounds(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return WNAF_SIZE(bits_per_bucket + 1); -} - -// WORKTODO: uniformize -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_offsets; - uint64_t* point_schedule; - uint32_t num_points; - uint32_t num_buckets; - bool* bucket_empty_status; -}; - -struct pippenger_runtime_state { - uint64_t* point_schedule; - bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_counts; - bool* bucket_empty_status; - uint64_t* round_counts; - uint64_t num_points; - - pippenger_runtime_state(const size_t num_initial_points); - pippenger_runtime_state(pippenger_runtime_state&& other); - pippenger_runtime_state& operator=(pippenger_runtime_state&& other); - ~pippenger_runtime_state(); - - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); -}; -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp deleted file mode 100644 index d61158b143..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp +++ /dev/null @@ -1,947 +0,0 @@ -#include "./scalar_multiplication.hpp" - -#include "barretenberg/common/throw_or_abort.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#include -#include -#include -#include - -#include "../../../groups/wnaf.hpp" -#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" -#include "barretenberg/ecc/curves/bn254/fq.hpp" -#include "barretenberg/ecc/curves/bn254/fr.hpp" -#include "barretenberg/ecc/curves/bn254/g1.hpp" -#include "./process_buckets.hpp" -#include "./runtime_states.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -#define BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 16] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 17] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 18] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 19] >> 32ULL)); \ - __builtin_prefetch(state.points + 
(state.point_schedule[schedule_it + 20] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 21] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 22] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 23] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 24] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 25] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 26] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 27] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 28] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 29] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 30] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 31] >> 32ULL)); \ - \ - uint64_t schedule_a = state.point_schedule[schedule_it]; \ - uint64_t schedule_b = state.point_schedule[schedule_it + 1]; \ - uint64_t schedule_c = state.point_schedule[schedule_it + 2]; \ - uint64_t schedule_d = state.point_schedule[schedule_it + 3]; \ - uint64_t schedule_e = state.point_schedule[schedule_it + 4]; \ - uint64_t schedule_f = state.point_schedule[schedule_it + 5]; \ - uint64_t schedule_g = state.point_schedule[schedule_it + 6]; \ - uint64_t schedule_h = state.point_schedule[schedule_it + 7]; \ - uint64_t schedule_i = state.point_schedule[schedule_it + 8]; \ - uint64_t schedule_j = state.point_schedule[schedule_it + 9]; \ - uint64_t schedule_k = state.point_schedule[schedule_it + 10]; \ - uint64_t schedule_l = state.point_schedule[schedule_it + 11]; \ - uint64_t schedule_m = state.point_schedule[schedule_it + 12]; \ - uint64_t schedule_n = state.point_schedule[schedule_it + 13]; \ - uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ - uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ - \ - g1::conditional_negate_affine( \ - state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 
9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ - \ - current_offset += 16; \ - schedule_it += 16; - -namespace grumpkin { -namespace scalar_multiplication { - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) -{ - // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); - for (size_t i = num_points - 1; i < num_points; --i) { - table[i * 2] = points[i]; - table[i * 2 + 1].x = beta * points[i].x; - table[i * 2 + 1].y = -points[i].y; - } -} - -/** - * Compute the windowed-non-adjacent-form versions of our scalar multipliers. - * - * We start by splitting our 254 bit scalars into 2 127-bit scalars, using the short weierstrass curve endomorphism - * (for a point P \in \G === (x, y) \in \Fq, then (\beta x, y) = (\lambda) * P , where \beta = 1^{1/3} mod Fq and - *\lambda = 1^{1/3} mod Fr) (which means we can represent a scalar multiplication (k * P) as (k1 * P + k2 * \lambda * - *P), where k1, k2 have 127 bits) (see field::split_into_endomorphism_scalars for more details) - * - * Once we have our 127-bit scalar multipliers, we determine the optimal number of pippenger rounds, given the number of - *points we're multiplying. Once we have the number of rounds, `m`, we need to split our scalar into `m` bit-slices. - *Each pippenger round will work on one bit-slice. - * - * Pippenger's algorithm works by, for each round, iterating over the points we're multplying. For each point, we - *examing the point's scalar multiplier and extract the bit-slice associated with the current pippenger round (we start - *with the most significant slice). We then use the bit-slice to index a 'bucket', which we add the point into. For - *example, if the bit slice is 01101, we add the corresponding point into bucket[13]. - * - * At the end of each pippenger round we concatenate the buckets together. E.g. if we have 8 buckets, we compute: - * sum = bucket[0] + 2 * bucket[1] + 3 * bucket[2] + 4 * bucket[3] + 5 * bucket[4] + 6 * bucket[5] + 7 * bucket[6] + 8 * - *bucket[7]. - * - * At the end of each pippenger round, the bucket sum will contain the scalar multiplication result for one bit slice. - * For example, say we have 16 rounds, where each bit slice contains 8 bits (8 * 16 = 128, enough to represent our 127 - *bit scalars). At the end of the first round, we will have taken the 8 most significant bits from every scalar - *multiplier. 
Our bucket sum will be the result of a mini-scalar-multiplication, where we have multiplied every point by - *the 8 most significant bits of each point's scalar multiplier. - * - * We repeat this process for every pippenger round. In our example, this gives us 16 bucket sums. - * We need to multiply the most significant bucket sum by 2^{120}, the second most significant bucket sum by 2^{112} - *etc. Once this is done we can add the bucket sums together, to evaluate our scalar multiplication result. - * - * Pippenger has complexity O(n / logn), because of two factors at play: the number of buckets we need to concatenate - *per round, and the number of points we need to add into buckets per round. - * - * To minimize the number of point additions per round, we want fewer rounds. But fewer rounds increases the number of - *bucket concatenations. The more points we have, the greater the time saving when reducing the number of rounds, which - *means we can afford to have more buckets per round. - * - * For a concrete example, with 2^20 points, the sweet spot is 2^15 buckets - with 2^15 buckets we can evaluate our 127 - *bit scalar multipliers in 8 rounds (we can represent b-bit windows with 2^{b-1} buckets, more on that below). - * - * This means that, for each round, we add 2^21 points into buckets (we've split our scalar multpliers into two - *half-width multipliers, so each round has twice the number of points. This is the reason why the endormorphism is - *useful here; without the endomorphism, we would need twice the number of buckets for each round). - * - * We also concatenate 2^15 buckets for each round. This requires 2^16 point additions. - * - * Meaning that the total number of point additions is (8 * 2^21) + (8 * 2^16) = 33 * 2^19 ~ 2^24 point additions. - * If we were to use a simple Montgomery double-and-add ladder to exponentiate each point, we would need 2^27 point - *additions (each scalar multiplier has ~2^7 non-zero bits, and there are 2^20 points). - * - * This makes pippenger 8 times faster than the naive O(n) equivalent. Given that a circuit with 1 million gates will - *require 9 multiple-scalar-multiplications with 2^20 points, efficiently using Pippenger's algorithm is essential for - *fast provers - * - * One additional efficiency gain is the use of 2^{b-1} buckets to represent b bits. To do this we represent our - *bit-slices in non-adjacent form. Non-adjacent form represents values using a base, where each 'bit' can take the - *values (-1, 0, 1). This is considerably more efficient than binary form for scalar multiplication, as inverting a - *point can be done by negating the y-coordinate. - * - * We actually use a slightly different representation than simple non-adjacent form. To represent b bits, a bit slice - *contains values from (-2^{b} - 1, ..., -1, 1, ..., 2^{b} - 1). i.e. we only have odd values. We do this to eliminate - *0-valued windows, as having a conditional branch in our hot loop to check if an entry is 0 is somethin we want to - *avoid. - * - * The above representation can be used to represent any binary number as long as we add a 'skew' factor. Each scalar - *multiplier's `skew` tracks if the scalar multiplier is even or odd. If it's even, `skew = true`, and we add `1` to our - *multiplier to make it odd. - * - * We then, at the end of the Pippenger algorithm, subtract a point from the total result, if that point's skew is - *`true`. - * - * At the end of `compute_wnaf_states`, `state.wnaf_table` will contain our wnaf entries, but unsorted. 
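// A minimal, self-contained sketch of the bucket-concatenation step described above:
// computing bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with roughly 2n additions via a
// running sum. `concatenate_buckets` is a hypothetical helper shown only for illustration,
// with plain integers standing in for curve points.
#include <cstdint>
#include <vector>

// Walking from the top bucket down, each bucket is folded into `running_sum` exactly once,
// and `accumulator` absorbs `running_sum` once per bucket, so bucket[k] is added (k + 1) times.
template <typename T> T concatenate_buckets(const std::vector<T>& buckets)
{
    T running_sum{};
    T accumulator{};
    for (size_t k = buckets.size(); k-- > 0;) {
        running_sum = running_sum + buckets[k];
        accumulator = accumulator + running_sum;
    }
    return accumulator;
}
// e.g. concatenate_buckets<uint64_t>({ 1, 1, 1, 1 }) returns 1 + 2 + 3 + 4 = 10.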
- * - * @param point_schedule Pointer to the output array with all WNAFs - * @param input_skew_table Pointer to the output array with all skews - * @param round_counts The number of points in each round - * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF - * @param num_initial_points The number of points before the endomorphism split - **/ -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points) -{ - const size_t num_points = num_initial_points * 2; - constexpr size_t MAX_NUM_ROUNDS = 256; - constexpr size_t MAX_NUM_THREADS = 128; - const size_t num_rounds = get_num_rounds(num_points); - const size_t bits_per_bucket = get_optimal_bucket_width(num_initial_points); - const size_t wnaf_bits = bits_per_bucket + 1; -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t num_initial_points_per_thread = num_initial_points / num_threads; - const size_t num_points_per_thread = num_points / num_threads; - std::array, MAX_NUM_THREADS> thread_round_counts; - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - thread_round_counts[i][j] = 0; - } - } -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - fr T0; - uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; - bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; - uint64_t offset = i * num_points_per_thread; - - for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { - T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); - } - } - - for (size_t i = 0; i < num_rounds; ++i) { - round_counts[i] = 0; - } - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - round_counts[j] += thread_round_counts[i][j]; - } - } -} - -/** - * Sorts our wnaf entries in increasing bucket order (per round). - * We currently don't multi-thread the inner sorting algorithm, and just split our threads over the number of rounds. - * A multi-threaded sorting algorithm could be more efficient, but the total runtime of `organize_buckets` is <5% of - * pippenger's runtime, so not a priority. - **/ -void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t num_points) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_rounds; ++i) { - scalar_multiplication::process_buckets(&point_schedule[i * num_points], - num_points, - static_cast(get_optimal_bucket_width(num_points / 2)) + 1); - } -} - -/** - * adds a bunch of points together using affine addition formulae. - * Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to perform. 
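// A minimal, self-contained sketch of the Montgomery batch-inversion idea the affine trick
// relies on (the full explanation follows in the comment below): one field inversion plus
// roughly three multiplications per element. `Fp` and `batch_invert` are hypothetical names
// over a toy prime field, shown only for illustration; zero elements are assumed absent,
// mirroring the zero check in the real code.
#include <cstdint>
#include <vector>

struct Fp {
    static constexpr uint64_t p = 1000003; // small prime standing in for the real base field
    uint64_t v;
    Fp operator*(Fp o) const { return { (v * o.v) % p }; }
    Fp invert() const // the single "expensive" inversion, via Fermat's little theorem
    {
        uint64_t r = 1, b = v, e = p - 2;
        while (e) {
            if (e & 1) { r = (r * b) % p; }
            b = (b * b) % p;
            e >>= 1;
        }
        return { r };
    }
};

// Invert every element of `x` in place using exactly one call to Fp::invert().
void batch_invert(std::vector<Fp>& x)
{
    std::vector<Fp> prefix(x.size());
    Fp acc{ 1 };
    for (size_t i = 0; i < x.size(); ++i) {
        prefix[i] = acc = acc * x[i]; // prefix[i] = x_0 * x_1 * ... * x_i
    }
    Fp inv = acc.invert(); // I = 1 / (x_0 * ... * x_{n-1})
    for (size_t i = x.size(); i-- > 0;) {
        Fp xi = x[i];
        x[i] = (i == 0) ? inv : inv * prefix[i - 1]; // 1/x_i = I * (x_0...x_{i-1}) * (x_{i+1}...x_{n-1})
        inv = inv * xi;                              // strip x_i: inv is now 1 / (x_0 * ... * x_{i-1})
    }
}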
- * Affine formula: - * - * \lambda = (y_2 - y_1) / (x_2 - x_1) - * x_3 = \lambda^2 - (x_2 + x_1) - * y_3 = \lambda*(x_1 - x_3) - y_1 - * - * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse, - * which is outrageously expensive. - * - * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0. - * - * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \} - * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products. - * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n] - * We then compute a single inverse: I = 1 / x_1x_2...x_n - * Finally, we can use our accumulated products, to quotient out individual inverses. - * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1) - * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we - * require. - * - * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse. - * Which means we can compute a point addition with SIX field multiplications in total. - * The traditional Jacobian-coordinate formula requires 11. - * - * There is a catch though - we need large sequences of independent point additions! - * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the sequence. - * - * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated - **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - - if (batch_inversion_accumulator == 0) { - throw_or_abort("attempted to invert zero in add_affine_points"); - } else { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. 
- // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { - continue; - } - if (points[i].x == points[i + 1].x) { - if (points[i].y == points[i + 1].y) { - // double - scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); - points[i + 1].x = points[i].y + points[i].y; // 2y - points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 - points[i + 1].y *= batch_inversion_accumulator; - batch_inversion_accumulator *= (points[i + 1].x); - continue; - } - points[i].self_set_infinity(); - points[i + 1].self_set_infinity(); - continue; - } - - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - if (!batch_inversion_accumulator.is_zero()) { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. - // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - if (points[i].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i + 1]; - continue; - } - if (points[i + 1].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i]; - continue; - } - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -/** - * evaluate a chain of pairwise additions. - * The additions are sequenced into base-2 segments - * i.e. 
pairs, pairs of pairs, pairs of pairs of pairs etc - * `max_bucket_bits` indicates the largest set of nested pairs in the array, - * which defines the iteration depth - **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) -{ - size_t end = state.num_points; - size_t start = 0; - for (size_t i = 0; i < max_bucket_bits; ++i) { - const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); - start = end - points_in_round; - if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } - } -} - -/** - * This is the entry point for our 'find a way of evaluating a giant multi-product using affine coordinates' algorithm - * By this point, we have already sorted our pippenger buckets. So we have the following situation: - * - * 1. We have a defined number of buckets points - * 2. We have a defined number of points, that need to be added into these bucket points - * 3. number of points >> number of buckets - * - * The algorithm begins by counting the number of points assigned to each bucket. - * For each bucket, we then take this count and split it into its base-2 components. - * e.g. if bucket[3] has 14 points, we split that into a sequence of (8, 4, 2) - * This base-2 splitting is useful, because we can take the bucket's associated points, and - * sort them into pairs, quads, octs etc. These mini-addition sequences are independent from one another, - * which means that we can use the affine trick to evaluate them. - * Once we're done, we have effectively reduced the number of points in the bucket to a logarithmic factor of the input. - * e.g. in the above example, once we've evaluated our pairwise addition of 8, 4 and 2 elements, - * we're left with 3 points. - * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. - * We repeat this process until every bucket only has one point assigned to it. - **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) -{ - - // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - // This method sorts our points into our required base-2 sequences. - // `max_bucket_bits` is log2(maximum bucket count). - // This sets the upper limit on how many iterations we need to perform in `evaluate_addition_chains`. - // e.g. if `max_bucket_bits == 3`, then we have at least one bucket with >= 8 points in it. - // which means we need to repeat our pairwise addition algorithm 3 times - // (e.g. add 4 pairs together to get 2 pairs, add those pairs together to get a single pair, which we add to reduce - // to our final point) - const size_t max_bucket_bits = construct_addition_chains(state, first_round); - - // if max_bucket_bits is 0, we're done! we can return - if (max_bucket_bits == 0) { - return state.point_pairs_1; - } - - // compute our required additions using the affine trick - evaluate_addition_chains(state, max_bucket_bits, handle_edge_cases); - - // this next step is a processing step, that computes a new point schedule for our reduced points. - // In the pippenger algorithm, we use a 64-bit uint to categorize each point. - // The high 32 bits describes the position of the point in a point array. 
- // The low 31 bits describes the bucket index that the point maps to - // The 32nd bit defines whether the point is actually a negation of our stored point. - - // We want to compute these 'point schedule' uints for our reduced points, so that we can recurse back into - // `reduce_buckets` - uint32_t start = 0; - const uint32_t end = static_cast(state.num_points); - // The output of `evaluate_addition_chains` has a bit of an odd structure, should probably refactor. - // Effectively, we used to have one big 1d array, and the act of computing these pair-wise point additions - // has chopped it up into sequences of smaller 1d arrays, with gaps in between - for (size_t i = 0; i < max_bucket_bits; ++i) { - const uint32_t points_in_round = - (static_cast(state.num_points) - state.bit_offsets[i + 1]) >> static_cast(i); - const uint32_t points_removed = points_in_round / 2; - - start = end - points_in_round; - const uint32_t modified_start = start + points_removed; - state.bit_offsets[i + 1] = modified_start; - } - - // iterate over each bucket. Identify how many remaining points there are, and compute their point scheduels - uint32_t new_num_points = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t& count = state.bucket_counts[i]; - uint32_t num_bits = numeric::get_msb(count) + 1; - uint32_t new_bucket_count = 0; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = state.bit_offsets[j]; - const bool has_entry = ((count >> j) & 1) == 1; - if (has_entry) { - uint64_t schedule = (static_cast(current_offset) << 32ULL) + i; - state.point_schedule[new_num_points++] = schedule; - ++new_bucket_count; - ++current_offset; - } - } - count = new_bucket_count; - } - - // modify `num_points` to reflect the new number of reduced points. - // also swap around the `point_pairs` pointer; what used to be our temporary array - // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; - state.num_points = new_num_points; - state.points = state.point_pairs_1; - state.point_pairs_1 = state.point_pairs_2; - state.point_pairs_2 = temp; - - // We could probably speed this up by unroling the recursion. - // But each extra call to `reduce_buckets` has an input size that is ~log(previous input size) - // so the extra run-time is meh - return reduce_buckets(state, false, handle_edge_cases); -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) -{ - // if this is the first call to `construct_addition_chains`, we need to count up our buckets - if (empty_bucket_counts) { - memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); - const uint32_t first_bucket = static_cast(state.point_schedule[0] & 0x7fffffffUL); - for (size_t i = 0; i < state.num_points; ++i) { - size_t bucket_index = static_cast(state.point_schedule[i] & 0x7fffffffUL); - ++state.bucket_counts[bucket_index - first_bucket]; - } - for (size_t i = 0; i < state.num_buckets; ++i) { - state.bucket_empty_status[i] = (state.bucket_counts[i] == 0); - } - } - - uint32_t max_count = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - max_count = state.bucket_counts[i] > max_count ? state.bucket_counts[i] : max_count; - } - - const uint32_t max_bucket_bits = numeric::get_msb(max_count); - - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - state.bit_offsets[i] = 0; - } - - // theoretically, can be unrolled using templated methods. - // However, explicitly unrolling the loop by using recursive template calls was slower! 
- // Inner loop is currently bounded by a constexpr variable, need to see what the compiler does with that... - count_bits(state.bucket_counts, &state.bit_offsets[0], state.num_buckets, max_bucket_bits); - - // we need to update `bit_offsets` to compute our point shuffle, - // but we need the original array later on, so make a copy. - std::array bit_offsets_copy = { 0 }; - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - bit_offsets_copy[i] = state.bit_offsets[i]; - } - - // this is where we take each bucket's associated points, and arrange them - // in a pairwise order, so that we can compute large sequences of additions using the affine trick - size_t schedule_it = 0; - uint32_t* bucket_count_it = state.bucket_counts; - - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t count = *bucket_count_it; - ++bucket_count_it; - uint32_t num_bits = numeric::get_msb(count) + 1; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = bit_offsets_copy[j]; - const size_t k_end = count & (1UL << j); - // This section is a bottleneck - to populate our point array, we need - // to read from memory locations that are effectively uniformly randomly distributed! - // (assuming our scalar multipliers are uniformly random...) - // In the absence of a more elegant solution, we use ugly macro hacks to try and - // unroll loops, and prefetch memory a few cycles before we need it - switch (k_end) { - case 64: { - [[fallthrough]]; - } - case 32: { - [[fallthrough]]; - } - case 16: { - for (size_t k = 0; k < (k_end >> 4); ++k) { - BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK; - } - break; - } - case 8: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 8] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 9] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 10] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 11] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 12] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 13] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 14] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 15] >> 32ULL)); - - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - const uint64_t schedule_e = state.point_schedule[schedule_it + 4]; - const uint64_t schedule_f = state.point_schedule[schedule_it + 5]; - const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; - const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - 
state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); - - current_offset += 8; - schedule_it += 8; - break; - } - case 4: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - current_offset += 4; - schedule_it += 4; - break; - } - case 2: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - current_offset += 2; - schedule_it += 2; - break; - } - case 1: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - ++current_offset; - ++schedule_it; - break; - } - case 0: { - break; - } - default: { - for (size_t k = 0; k < k_end; ++k) { - uint64_t schedule = state.point_schedule[schedule_it]; - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 1] >> 32ULL)); - - const uint64_t 
predicate = (schedule >> 31UL) & 1UL; - - g1::conditional_negate_affine( - state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); - ++current_offset; - ++schedule_it; - } - } - } - } - } - return max_bucket_bits; -} - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); - -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t j = 0; j < num_threads; ++j) { - thread_accumulators[j].self_set_infinity(); - - for (size_t i = 0; i < num_rounds; ++i) { - - const uint64_t num_round_points = state.round_counts[i]; - - g1::element accumulator; - accumulator.self_set_infinity(); - - if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { - } else { - - const uint64_t num_round_points_per_thread = num_round_points / num_threads; - const uint64_t leftovers = - (j == num_threads - 1) ? (num_round_points) - (num_round_points_per_thread * num_threads) : 0; - - uint64_t* thread_point_schedule = - &state.point_schedule[(i * num_points) + j * num_round_points_per_thread]; - const size_t first_bucket = thread_point_schedule[0] & 0x7fffffffU; - const size_t last_bucket = - thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; - const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); - product_state.num_points = static_cast(num_round_points_per_thread + leftovers); - product_state.points = points; - product_state.point_schedule = thread_point_schedule; - product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; - running_sum.self_set_infinity(); - - // one nice side-effect of the affine trick, is that half of the bucket concatenation - // algorithm can use mixed addition formulae, instead of full addition formulae - size_t output_it = product_state.num_points - 1; - for (size_t k = num_thread_buckets - 1; k > 0; --k) { - if (__builtin_expect(!product_state.bucket_empty_status[k], 1)) { - running_sum += (output_buckets[output_it]); - --output_it; - } - accumulator += running_sum; - } - running_sum += output_buckets[0]; - accumulator.self_dbl(); - accumulator += running_sum; - - // we now need to scale up 'running sum' up to the value of the first bucket. - // e.g. 
if first bucket is 0, no scaling - // if first bucket is 1, we need to add (2 * running_sum) - if (first_bucket > 0) { - uint32_t multiplier = static_cast(first_bucket << 1UL); - size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; - bool init = false; - while (shift != static_cast(-1)) { - if (init) { - rolling_accumulator.self_dbl(); - if (((multiplier >> shift) & 1)) { - rolling_accumulator += running_sum; - } - } else { - rolling_accumulator += running_sum; - } - init = true; - shift -= 1; - } - accumulator += rolling_accumulator; - } - } - - if (i == (num_rounds - 1)) { - const size_t num_points_per_thread = num_points / num_threads; - bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; - for (size_t k = 0; k < num_points_per_thread; ++k) { - if (skew_table[k]) { - addition_temporary = -point_table[k]; - accumulator += addition_temporary; - } - } - } - - if (i > 0) { - for (size_t k = 0; k < bits_per_bucket + 1; ++k) { - thread_accumulators[j].self_dbl(); - } - } - thread_accumulators[j] += accumulator; - } - } - - g1::element result; - result.self_set_infinity(); - for (size_t i = 0; i < num_threads; ++i) { - result += thread_accumulators[i]; - } - return result; -} - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); - return result; -} - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // our windowed non-adjacent form algorthm requires that each thread can work on at least 8 points. - // If we fall below this theshold, fall back to the traditional scalar multiplication algorithm. - // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger -#ifndef NO_MULTITHREADING - const size_t threshold = std::max(max_threads::compute_num_threads() * 8, 8UL); -#else - const size_t threshold = 8UL; -#endif - - if (num_initial_points == 0) { - g1::element out = g1::one; - out.self_set_infinity(); - return out; - } - - if (num_initial_points <= threshold) { - std::vector exponentiation_results(num_initial_points); - // might as well multithread this... - // Possible optimization: use group::batch_mul_with_endomorphism here. 
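    // Note on the stride-2 indexing below: the point table interleaves each base point with its
    // endomorphism-mapped copy (as laid out by generate_pippenger_point_table), so the original
    // bases live at even indices, hence points[i * 2].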
-#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; - } - - for (size_t i = num_initial_points - 1; i > 0; --i) { - exponentiation_results[i - 1] += exponentiation_results[i]; - } - return exponentiation_results[0]; - } - - const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); - const size_t num_slice_points = static_cast(1ULL << slice_bits); - - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); - - if (num_slice_points != num_initial_points) { - const uint64_t leftover_points = num_initial_points - num_slice_points; - return result + pippenger(scalars + num_slice_points, - points + static_cast(num_slice_points * 2), - static_cast(leftover_points), - state, - handle_edge_cases); - } else { - return result; - } -} - -/** - * It's pippenger! But this one has go-faster stripes and a prediliction for questionable life choices. - * We use affine-addition formula in this method, which paradoxically is ~45% faster than the mixed addition formulae. - * See `scalar_multiplication.cpp` for a more detailed description. - * - * It's...unsafe, because we assume that the incomplete addition formula exceptions are not triggered. - * We don't bother to check for this to avoid conditional branches in a critical section of our code. - * This is fine for situations where your bases are linearly independent (i.e. KZG10 polynomial commitments), - * because triggering the incomplete addition exceptions is about as hard as solving the disrete log problem. - * - * This is ok for the prover, but GIANT RED CLAXON WARNINGS FOR THE VERIFIER - * Don't use this in a verification algorithm! That would be a really bad idea. - * Unless you're a malicious adversary, then it would be a great idea! - * - **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - return pippenger(scalars, points, num_initial_points, state, false); -} -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - std::vector G_mod(num_initial_points * 2); - grumpkin::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); - return pippenger(scalars, &G_mod[0], num_initial_points, state, false); -} -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp deleted file mode 100644 index a9a5c9c89d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#pragma once - -#include "../grumpkin.hpp" -#include "./runtime_states.hpp" -#include -#include - -namespace grumpkin { -namespace scalar_multiplication { - -constexpr size_t get_num_buckets(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return 1UL << bits_per_bucket; -} - -/** - * pointers that describe how to add points into buckets, for the pippenger algorithm. 
- * `wnaf_table` is an unrolled two-dimensional array, with each inner array being of size `n`, - * where `n` is the number of points being multiplied. The second dimension size is defined by - * the number of pippenger rounds (fixed for a given `n`, see `get_num_rounds`) - * - * An entry of `wnaf_table` contains the following three pieces of information: - * 1: the point index that we're working on. This is stored in the high 32 bits - * 2: the bucket index that we're adding the point into. This is stored in the low 31 bits - * 3: the sign of the point we're adding (i.e. do we actually need to subtract). This is stored in the 32nd bit. - * - * We pack this information into a 64 bit unsigned integer, so that we can more efficiently sort our wnaf entries. - * For a given round, we want to sort our wnaf entries in increasing bucket index order. - * - * This is so that we can efficiently use multiple threads to execute the pippenger algorithm. - * For a given round, a given point's bucket index will be uniformly randomly distributed, - * assuming the inputs are from a zero-knowledge proof. This is because the scalar multiplier will be uniformly randomly - *distributed, and the bucket indices are derived from the scalar multiplier. - * - * This means that, if we were to iterate over all of our points in order, and add each point into its associated - *bucket, we would be accessing all of our buckets in a completely random pattern. - * - * Aside from memory latency problems this incurs, this makes the naive algorithm unsuitable for multithreading - we - *cannot assign a thread a tranche of points, because each thread will be adding points into the same set of buckets, - *triggering race conditions. We do not want to manage the overhead of thread locks for each bucket; the process of - *adding a point into a bucket takes, on average, only 400 CPU cycles, so the slowdown of managing mutex locks would add - *considerable overhead. - * - * The solution is to sort the buckets. If the buckets are sorted, we can assign a tranche of buckets to individual - *threads, safe in the knowledge that there will be no race conditions, with one condition. A thread's starting bucket - *may be equal to the previous thread's end bucket, so we need to ensure that each thread works on a local array of - *buckets. This adds little overhead (for 2^20 points, we have 32,768 buckets. With 8 threads, the amount of bucket - *overlap is ~16 buckets, so we could incur 16 extra 'additions' in pippenger's bucket concatenation phase, but this is - *an insignificant contribution). - * - * The alternative approach (the one we used to use) is to slice up all of the points being multiplied amongst all - *available threads, and run the complete pippenger algorithm for each thread. This is suboptimal, because the - *complexity of pippenger is O(n / logn) point additions, and a sequence of smaller pippenger calls will have a smaller - *`n`. - * - * This is the motivation for multi-threading the actual Pippenger algorithm. In addition, the above approach performs - *extremely poorly for GPUs, where the number of threads can be as high as 2^10 (for a multi-scalar-multiplication of - *2^20 points, this doubles the number of pippenger rounds per thread) - * - * To give concrete numbers, the difference between calling pippenger on 2^20 points, and calling pippenger 8 times on - *2^17 points, is 5-10%. Which means that, for 8 threads, we need to ensure that our sorting algorithm adds less than 5% - *to the total runtime of pippenger. 
Given a single cache miss per point would increase the run-time by 25%, this is not - *much room to work with! - * - * However, a radix sort, combined with the fact that the total number of buckets is quite small (2^16 at most), seems - *to be fast enough. Benchmarks indicate (i7-8650U, 8 threads) that, for 2^20 points, the total runtime is <1200ms and - *of that, the radix sort consumes 58ms (4.8%) - * - * One advantage of sorting by bucket order vs point order, is that a 'bucket' is 96 bytes large (sizeof(g1::element), - *buckets have z-coordinates). Points, on the other hand, are 64 bytes large (affine points, no z-coordinate). This - *makes fetching random point locations in memory more efficient than fetching random bucket locations, as each point - *occupies a single cache line. Using __builtin_prefetch to recover the point just before it's needed, seems to improve - *the runtime of pippenger by 10-20%. - * - * Finally, `skew_table` tracks whether a scalar multplier is even or odd - * (if it's even, we need to subtract the point from the total result, - * because our windowed non-adjacent form values can only be odd) - * - **/ - -struct multiplication_thread_state { - g1::element* buckets; - const uint64_t* point_schedule; -}; - -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points); - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); - -void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); - -inline void count_bits(uint32_t* bucket_counts, - uint32_t* bit_offsets, - const uint32_t num_buckets, - const size_t num_bits) -{ - for (size_t i = 0; i < num_buckets; ++i) { - const uint32_t count = bucket_counts[i]; - for (uint32_t j = 0; j < num_bits; ++j) { - bit_offsets[j + 1] += (count & (1U << j)); - } - } - bit_offsets[0] = 0; - for (size_t i = 2; i < num_bits + 1; ++i) { - bit_offsets[i] += bit_offsets[i - 1]; - } -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); - -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); - -void evaluate_addition_chains(affine_product_runtime_state& state, - const size_t max_bucket_bits, - bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git 
a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp deleted file mode 100644 index fa2e5f15c6..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp +++ /dev/null @@ -1,946 +0,0 @@ -#include -#include -#include - -#include "pippenger.hpp" -#include "scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/test.hpp" -#include "barretenberg/srs/io.hpp" -#include "barretenberg/numeric/random/engine.hpp" - -// paths are relative to cpp/build/ -std::string GRUMPKIN_SRS_PATH = "../srs_db/grumpkin"; - -using namespace grumpkin; -using namespace grumpkin::scalar_multiplication; - -namespace { -auto& engine = numeric::random::get_debug_engine(); -} - -TEST(grumpkin_scalar_multiplication, fake_transcript_io) -{ - size_t file_num = 0; - std::string transcript_path = io::get_transcript_path(GRUMPKIN_SRS_PATH, file_num); - - std::vector srs(3); - grumpkin::io::read_transcript_g1(&srs[0], /*degree=*/3, GRUMPKIN_SRS_PATH); - // the SRS is [x^i]_1 where x = 2 - EXPECT_EQ(static_cast(g1::one), srs[0]); - EXPECT_EQ(static_cast(g1::one + g1::one), srs[1]); - EXPECT_EQ(static_cast(g1::one + g1::one + g1::one + g1::one), srs[2]); -} - -TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) -{ - constexpr size_t num_points = 128; - auto pippenger = Pippenger(GRUMPKIN_SRS_PATH, num_points / 2); - auto monomials = pippenger.get_point_table(); - - std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); - std::array bucket_empty_status; - - std::array transcript; - std::array transcript_points; - transcript_points[0] = 0x0; - transcript_points[1] = 0x2; - transcript_points[2] = 0x4; - transcript_points[3] = 0x6; - transcript_points[4] = 0xb; - transcript_points[5] = 0xc; - transcript_points[6] = 0xe; - transcript_points[7] = 0x11; - transcript_points[8] = 0x13; - transcript_points[9] = 0x14; - transcript_points[10] = 0x15; - transcript_points[11] = 0x16; - transcript_points[12] = 0x17; - transcript_points[13] = 0x18; - transcript_points[14] = 0x20; - transcript_points[15] = 0x21; - transcript_points[16] = 0x22; - transcript_points[17] = 0x27; - transcript_points[18] = 0x29; - transcript_points[19] = 0x2b; - transcript_points[20] = 0x2c; - transcript_points[21] = 0x2d; - transcript_points[22] = 0x2e; - transcript_points[23] = 0x36; - transcript_points[24] = 0x37; - transcript_points[25] = 0x38; - transcript_points[26] = 0x3e; - transcript_points[27] = 0x3f; - transcript_points[28] = 0x4e; - transcript_points[29] = 0x4f; - transcript_points[30] = 0x50; - transcript_points[31] = 0x51; - transcript_points[32] = 0x41; - transcript_points[33] = 0x52; - transcript_points[34] = 0x53; - transcript_points[35] = 0x54; - transcript_points[36] = 0x43; - transcript_points[37] = 0x57; - transcript_points[38] = 0x46; - transcript_points[39] = 0x58; - transcript_points[40] = 0x5b; - transcript_points[41] = 0x5e; - transcript_points[42] = 0x42; - transcript_points[43] = 0x47; - transcript_points[44] = 0x4b; - transcript_points[45] = 0x4d; - transcript_points[46] = 0x6b; - transcript_points[47] = 0x65; - transcript_points[48] = 0x6d; - transcript_points[49] = 0x67; - transcript_points[50] = 0x6f; - transcript_points[51] = 0x68; - transcript_points[52] = 0x69; - transcript_points[53] = 0x6a; - transcript_points[54] = 0x71; - transcript_points[55] = 0x72; - 
transcript_points[56] = 0x73; - transcript_points[57] = 0x74; - transcript_points[58] = 0x75; - transcript_points[59] = 0x66; - transcript_points[60] = 0x79; - transcript_points[62] = 0x7c; - transcript_points[61] = 0x7e; - transcript_points[63] = 0x7f; - transcript_points[64] = 0x1; - transcript_points[65] = 0x3; - transcript_points[66] = 0x5; - transcript_points[67] = 0x7; - transcript_points[68] = 0x8; - transcript_points[69] = 0x9; - transcript_points[70] = 0xa; - transcript_points[71] = 0xd; - transcript_points[72] = 0xf; - transcript_points[73] = 0x10; - transcript_points[74] = 0x12; - transcript_points[75] = 0x19; - transcript_points[76] = 0x1a; - transcript_points[77] = 0x1b; - transcript_points[78] = 0x1c; - transcript_points[79] = 0x1d; - transcript_points[80] = 0x1e; - transcript_points[81] = 0x1f; - transcript_points[82] = 0x23; - transcript_points[83] = 0x24; - transcript_points[84] = 0x25; - transcript_points[85] = 0x26; - transcript_points[86] = 0x28; - transcript_points[87] = 0x2a; - transcript_points[88] = 0x2f; - transcript_points[89] = 0x30; - transcript_points[90] = 0x31; - transcript_points[91] = 0x32; - transcript_points[92] = 0x33; - transcript_points[93] = 0x34; - transcript_points[94] = 0x35; - transcript_points[95] = 0x39; - transcript_points[96] = 0x3a; - transcript_points[97] = 0x3b; - transcript_points[98] = 0x3c; - transcript_points[99] = 0x3d; - transcript_points[100] = 0x48; - transcript_points[101] = 0x49; - transcript_points[102] = 0x55; - transcript_points[103] = 0x56; - transcript_points[104] = 0x4a; - transcript_points[105] = 0x44; - transcript_points[106] = 0x45; - transcript_points[107] = 0x40; - transcript_points[108] = 0x59; - transcript_points[109] = 0x5a; - transcript_points[110] = 0x5c; - transcript_points[111] = 0x5d; - transcript_points[112] = 0x5f; - transcript_points[113] = 0x60; - transcript_points[114] = 0x61; - transcript_points[115] = 0x62; - transcript_points[116] = 0x63; - transcript_points[117] = 0x4c; - transcript_points[118] = 0x6c; - transcript_points[119] = 0x6e; - transcript_points[120] = 0x64; - transcript_points[121] = 0x70; - transcript_points[122] = 0x77; - transcript_points[123] = 0x78; - transcript_points[124] = 0x76; - transcript_points[125] = 0x7a; - transcript_points[126] = 0x7b; - transcript_points[127] = 0x7d; - - for (size_t i = 0; i < 64; ++i) { - transcript[i] = 0; - transcript[i + 64] = 1; - } - for (size_t i = 0; i < num_points; ++i) { - point_schedule[i] = (static_cast(transcript_points[i]) << 32ULL) + transcript[i]; - } - std::array expected; - for (size_t i = 0; i < num_points; ++i) { - expected[i].self_set_infinity(); - } - - for (size_t i = 0; i < num_points; ++i) { - size_t schedule = transcript[i] & 0x7fffffffU; - { - expected[schedule] += monomials[static_cast(transcript_points[i])]; - } - } - - std::array point_pairs; - std::array output_buckets; - std::array scratch_space; - std::array bucket_counts; - std::array bit_offsets = { 0 }; - - scalar_multiplication::affine_product_runtime_state product_state{ - &monomials[0], &point_pairs[0], &output_buckets[0], - &scratch_space[0], &bucket_counts[0], &bit_offsets[0], - &point_schedule[0], num_points, 2, - &bucket_empty_status[0] - }; - - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); - - for (size_t i = 0; i < product_state.num_buckets; ++i) { - expected[i] = expected[i].normalize(); - EXPECT_EQ((output[i].x == expected[i].x), true); - EXPECT_EQ((output[i].y == expected[i].y), true); - } -} - 
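// A minimal sketch of the point_schedule packing convention described in the deleted
// scalar_multiplication.hpp comment and built by hand in the test above: point index in
// the high 32 bits, sign ("negate") flag in bit 31, bucket index in the low 31 bits.
// The helper name is hypothetical; the masks and shifts match those used by the code.
#include <cassert>
#include <cstdint>

inline uint64_t pack_schedule_entry(uint64_t point_index, bool negate, uint64_t bucket_index)
{
    return (point_index << 32ULL) | (static_cast<uint64_t>(negate) << 31ULL) | (bucket_index & 0x7fffffffULL);
}

int main()
{
    const uint64_t entry = pack_schedule_entry(/*point_index=*/0x7c, /*negate=*/true, /*bucket_index=*/1);

    const uint64_t point_index = entry >> 32ULL;         // which point table entry to add
    const uint64_t predicate = (entry >> 31ULL) & 1ULL;  // 1 => conditionally negate before adding
    const uint64_t bucket_index = entry & 0x7fffffffULL; // which bucket receives the point

    assert(point_index == 0x7c);
    assert(predicate == 1);
    assert(bucket_index == 1);
    return 0;
}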
-TEST(grumpkin_scalar_multiplication, reduce_buckets) -{ - constexpr size_t num_initial_points = 1 << 12; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::element* expected_buckets = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points * 2))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points * 2))); - - memset((void*)scratch_points, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)expected_buckets, 0x00, (num_points * 2) * sizeof(g1::element)); - memset((void*)bucket_empty_status, 0x00, (num_points * 2) * sizeof(bool)); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - - // WORKTODO: unify by using 0 g2 elts - grumpkin::io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - for (size_t i = 0; i < num_initial_points; ++i) { - scalars[i] = fr::random_element(); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * 100 * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - - uint64_t* point_schedule_copy = static_cast(aligned_alloc(64, sizeof(uint64_t) * num_points * 2)); - for (size_t i = 0; i < num_points; ++i) { - state.point_schedule[i + num_points] = state.point_schedule[i + num_points] & 0xffffffff7fffffffUL; - // printf("state.point_schedule[%lu] = %lx \n", i, state.point_schedule[i]); - point_schedule_copy[i] = state.point_schedule[i + num_points]; - } - const size_t first_bucket = point_schedule_copy[0] & 0x7fffffffULL; - const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - 
&state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - for (size_t i = 0; i < num_points; ++i) { - expected_buckets[i].self_set_infinity(); - } - for (size_t i = 0; i < num_points; ++i) { - uint64_t schedule = point_schedule_copy[i]; - uint64_t bucket_index = schedule & 0x7fffffffU; - uint64_t point_index = schedule >> 32ULL; - uint64_t predicate = (schedule >> 31ULL) & 1ULL; - // printf("expected bucket index = %lu \n", bucket_index - first_bucket); - g1::element& bucket = expected_buckets[bucket_index - first_bucket]; - g1::affine_element& point = monomials[point_index]; - bucket.self_mixed_add_or_sub(point, predicate); - } - - size_t it = 0; - - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); - - printf("num buckets = %zu \n", num_buckets); - for (size_t i = 0; i < num_buckets; ++i) { - if (!bucket_empty_status[i]) { - g1::element expected = expected_buckets[i].normalize(); - EXPECT_EQ((expected.x == result_buckets[it].x), true); - EXPECT_EQ((expected.y == result_buckets[it].y), true); - ++it; - } else { - printf("recorded empty bucket???\n"); - } - } - aligned_free(bucket_empty_status); - aligned_free(expected_buckets); - aligned_free(point_schedule_copy); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -// This test intermittenly fails. -TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points))); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_points, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = 
std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(grumpkin_scalar_multiplication, add_affine_points) -{ - constexpr size_t num_points = 20; - g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - fq* scratch_space = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - fq* lambda = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - - g1::element* points_copy = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points))); - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - points_copy[i].x = points[i].x; - points_copy[i].y = points[i].y; - points_copy[i].z = fq::one(); - } - - size_t count = num_points - 1; - for (size_t i = num_points - 2; i < num_points; i -= 2) { - points_copy[count--] = points_copy[i] + points_copy[i + 1]; - points_copy[count + 1] = points_copy[count + 1].normalize(); - } - - scalar_multiplication::add_affine_points(points, num_points, scratch_space); - for (size_t i = num_points - 1; i > num_points - 1 - (num_points / 2); --i) { - EXPECT_EQ((points[i].x == points_copy[i].x), true); - EXPECT_EQ((points[i].y == points_copy[i].y), true); - } - aligned_free(lambda); - aligned_free(points); - aligned_free(points_copy); - aligned_free(scratch_space); -} - -TEST(grumpkin_scalar_multiplication, construct_addition_chains) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * 
num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - bool* bucket_empty_status = static_cast(aligned_alloc(64, num_points * sizeof(bool))); - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(grumpkin_scalar_multiplication, endomorphism_split) -{ - fr scalar = fr::random_element(); - - g1::element expected = g1::one * scalar; - - // we want to test that we can split a scalar into two half-length components, using the same location in memory. - fr* k1_t = &scalar; - fr* k2_t = (fr*)&scalar.data[2]; - - fr::split_into_endomorphism_scalars(scalar, *k1_t, *k2_t); - // The compiler really doesn't like what we're doing here, - // and disabling the array-bounds error project-wide seems unsafe. 
- // The large macro blocks are here to warn that we should be careful when - // aliasing the arguments to split_into_endomorphism_scalars -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - fr k1{ (*k1_t).data[0], (*k1_t).data[1], 0, 0 }; - fr k2{ (*k2_t).data[0], (*k2_t).data[1], 0, 0 }; -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - g1::element result; - g1::element t1 = g1::affine_one * k1; - g1::affine_element generator = g1::affine_one; - fq beta = fq::cube_root_of_unity(); - generator.x = generator.x * beta; - generator.y = -generator.y; - g1::element t2 = generator * k2; - result = t1 + t2; - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, radix_sort) -{ - // check that our radix sort correctly sorts! - constexpr size_t target_degree = 1 << 8; - constexpr size_t num_rounds = scalar_multiplication::get_num_rounds(target_degree * 2); - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < target_degree; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); - - uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); - memcpy((void*)wnaf_copy, (void*)state.point_schedule, sizeof(uint64_t) * target_degree * 2 * num_rounds); - - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, target_degree * 2); - for (size_t i = 0; i < num_rounds; ++i) { - uint64_t* unsorted_wnaf = &wnaf_copy[i * target_degree * 2]; - uint64_t* sorted_wnaf = &state.point_schedule[i * target_degree * 2]; - - const auto find_entry = [unsorted_wnaf, num_entries = target_degree * 2](auto x) { - for (size_t k = 0; k < num_entries; ++k) { - if (unsorted_wnaf[k] == x) { - return true; - } - } - return false; - }; - for (size_t j = 0; j < target_degree * 2; ++j) { - EXPECT_EQ(find_entry(sorted_wnaf[j]), true); - if (j > 0) { - EXPECT_EQ((sorted_wnaf[j] & 0x7fffffffU) >= (sorted_wnaf[j - 1] & 0x7fffffffU), true); - } - } - } - - free(scalars); - free(wnaf_copy); -} - -HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) -{ - // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. 
- // Check that this is done correctly - size_t transcript_degree = 1 << 20; - size_t target_degree = 1200000; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - io::read_transcript(monomials, transcript_degree, GRUMPKIN_SRS_PATH); - - memcpy((void*)(monomials + (2 * transcript_degree)), - (void*)monomials, - ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - fr accumulator = source_scalar; - for (size_t i = 0; i < target_degree; ++i) { - accumulator *= source_scalar; - fr::__copy(accumulator, scalars[i]); - } - scalar_multiplication::pippenger_runtime_state state(target_degree); - - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); - first = first.normalize(); - - for (size_t i = 0; i < target_degree; ++i) { - scalars[i].self_neg(); - } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); - - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); - second = second.normalize(); - - EXPECT_EQ((first.z == second.z), true); - EXPECT_EQ((first.z == fq::one()), true); - EXPECT_EQ((first.x == second.x), true); - EXPECT_EQ((first.y == -second.y), true); - - aligned_free(monomials); - aligned_free(scalars); -} - -TEST(grumpkin_scalar_multiplication, undersized_inputs) -{ - // we fall back to traditional scalar multiplication algorithm for small input sizes. - // Check this is done correctly - size_t num_points = 17; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, 
num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) -{ - constexpr size_t num_points = 128; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - g1::affine_element point = g1::affine_element(g1::element::random_element()); - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = point; - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - if (!expected.is_point_at_infinity()) { - expected = expected.normalize(); - } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_unsafe) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = 
fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_one) -{ - size_t num_points = 1; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * 1); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, 
num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_zero_points) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalars[0] = fr::zero(); - points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); - - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp deleted file mode 100644 index b490f4973d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// #include "scalar_multiplication.hpp" -// #include "pippenger.hpp" -// #include "barretenberg/common/mem.hpp" - -// using namespace barretenberg; - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size) -// { -// auto ptr = aligned_alloc(64, size); -// return ptr; -// } - -// WASM_EXPORT void bbfree(void* ptr) -// { -// aligned_free(ptr); -// } - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) -// { -// auto ptr = new scalar_multiplication::Pippenger(points, num_points); -// return ptr; -// } - -// WASM_EXPORT void delete_pippenger(void* pippenger) -// { -// delete reinterpret_cast(pippenger); -// } - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr) -// { -// scalar_multiplication::pippenger_runtime_state state(range); -// auto pippenger = reinterpret_cast(pippenger_ptr); -// auto scalars = reinterpret_cast(scalars_ptr); -// auto result = reinterpret_cast(result_ptr); -// *result = pippenger->pippenger_unsafe(scalars, from, range); -// } - -// WASM_EXPORT void g1_sum(void* points_ptr, const size_t num_points, void* result_ptr) -// { -// auto points = reinterpret_cast(points_ptr); -// auto result = reinterpret_cast(result_ptr); -// result->self_set_infinity(); -// *result = std::accumulate(points, points + num_points, *result); -// } -// } diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp deleted file mode 100644 index 60af544690..0000000000 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #include -// #include "../g1.hpp" - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size); - -// WASM_EXPORT void bbfree(void* ptr); - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t 
num_points); - -// WASM_EXPORT void delete_pippenger(void* pippenger); - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr); WASM_EXPORT void g1_sum(void* points_ptr, size_t num_points, void* result_ptr); -// } diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp index 08c6b62960..c32ced45c9 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp @@ -1,44 +1,52 @@ #include "pippenger.hpp" #include "barretenberg/srs/io.hpp" -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) +template +Pippenger::Pippenger(typename Curve::AffineElement* points, size_t num_points) : monomials_(points) , num_points_(num_points) { - grumpkin::io::byteswap(&monomials_[0], num_points * 64); - scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + io::byteswap(&monomials_[0], num_points * 64); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -Pippenger::Pippenger(uint8_t const* points, size_t num_points) +template +Pippenger::Pippenger(uint8_t const* points, size_t num_points) : num_points_(num_points) { monomials_ = point_table_alloc(num_points); - grumpkin::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + barretenberg::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -Pippenger::Pippenger(std::string const& path, size_t num_points) +template +Pippenger::Pippenger(std::string const& path, size_t num_points) : num_points_(num_points) { monomials_ = point_table_alloc(num_points); - grumpkin::io::read_transcript_g1(monomials_, num_points, path); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + barretenberg::io::read_transcript_g1(monomials_, num_points, path); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) +template +typename Curve::Element Pippenger::pippenger_unsafe(typename Curve::ScalarField* scalars, + size_t from, + size_t range) { - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); + scalar_multiplication::pippenger_runtime_state state(range); + return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); } -Pippenger::~Pippenger() +template Pippenger::~Pippenger() { free(monomials_); } +template class Pippenger; + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp index a6c55f1bc4..f9a9e1e797 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp @@ -7,7 +7,7 @@ #include #endif -namespace 
grumpkin { +namespace barretenberg { namespace scalar_multiplication { inline size_t point_table_size(size_t num_points) @@ -32,14 +32,17 @@ template inline T* point_table_alloc(size_t num_points) return (T*)aligned_alloc(64, point_table_buf_size(num_points)); } -class Pippenger { +template class Pippenger { public: + using ScalarField = typename Curve::ScalarField; + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; /** * Expects points to be buffer of size as per point_table_size(). * It expects the crs to start at points[1], and it fills in affine_one at points[0]. * The crs undergoes a byteswap, and then the point table is generated. */ - Pippenger(g1::affine_element* points, size_t num_points); + Pippenger(AffineElement* points, size_t num_points); Pippenger(uint8_t const* points, size_t num_points); @@ -47,16 +50,18 @@ class Pippenger { ~Pippenger(); - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); + Element pippenger_unsafe(ScalarField* scalars, size_t from, size_t range); - g1::affine_element* get_point_table() const { return monomials_; } + AffineElement* get_point_table() const { return monomials_; } size_t get_num_points() const { return num_points_; } private: - g1::affine_element* monomials_; + AffineElement* monomials_; size_t num_points_; }; +extern template class Pippenger; + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp index f56bdaa936..01f92b8673 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp @@ -2,7 +2,7 @@ #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept { @@ -61,4 +61,4 @@ void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uin radix_sort(wnaf_entries, num_entries, shift); } } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp index d4ef31da06..bde5916663 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp @@ -3,10 +3,10 @@ #include #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp index 36d894eafa..9e0ba1fc03 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp @@ -8,16 +8,19 @@ #include #endif -namespace grumpkin { 
+namespace barretenberg { namespace scalar_multiplication { -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) +template pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) { + using Fq = typename Curve::BaseField; + using AffineElement = typename Curve::AffineElement; + constexpr size_t MAX_NUM_ROUNDS = 256; num_points = num_initial_points * 2; const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); + 1U << scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); #ifndef NO_MULTITHREADING const size_t num_threads = max_threads::compute_num_threads(); #else @@ -25,15 +28,15 @@ pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points #endif const size_t prefetch_overflow = 16 * num_threads; const size_t num_rounds = - static_cast(grumpkin::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); + static_cast(scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); point_schedule = (uint64_t*)(aligned_alloc( 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); + point_pairs_1 = (AffineElement*)(aligned_alloc( + 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(AffineElement))); + point_pairs_2 = (AffineElement*)(aligned_alloc( + 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(AffineElement))); + scratch_space = (Fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(AffineElement))); bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); @@ -45,13 +48,9 @@ pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points #endif for (size_t i = 0; i < num_threads; ++i) { const size_t thread_offset = i * points_per_thread; - memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); + memset((void*)(point_pairs_1 + thread_offset + (i * 16)), 0, (points_per_thread + 16) * sizeof(AffineElement)); + memset((void*)(point_pairs_2 + thread_offset + (i * 16)), 0, (points_per_thread + 16) * sizeof(AffineElement)); + memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(Fq)); for (size_t j = 0; j < num_rounds; ++j) { const size_t round_offset = (j * static_cast(num_points)); memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); @@ -65,7 +64,7 @@ 
pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); } -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) +template pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) { point_schedule = other.point_schedule; skew_table = other.skew_table; @@ -90,7 +89,8 @@ pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other num_points = other.num_points; } -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) +template +pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) { if (point_schedule) { aligned_free(point_schedule); @@ -152,14 +152,15 @@ pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_st return *this; } -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) +template +affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state( + const size_t num_threads, const size_t thread_index) { const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); + const size_t num_buckets = + static_cast(1U << scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - scalar_multiplication::affine_product_runtime_state product_state; + scalar_multiplication::affine_product_runtime_state product_state; product_state.point_pairs_1 = point_pairs_1 + (thread_index * points_per_thread) + (thread_index * 16); product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); @@ -170,7 +171,7 @@ affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime return product_state; } -pippenger_runtime_state::~pippenger_runtime_state() +template pippenger_runtime_state::~pippenger_runtime_state() { if (point_schedule) { aligned_free(point_schedule); @@ -208,5 +209,10 @@ pippenger_runtime_state::~pippenger_runtime_state() aligned_free(round_counts); } } + +template struct affine_product_runtime_state; +template struct affine_product_runtime_state; +template struct pippenger_runtime_state; +template struct pippenger_runtime_state; } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp index 050c955c8c..b0102485f4 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp @@ -1,9 +1,11 @@ #pragma once -#include "../grumpkin.hpp" +// #include "../g1.hpp" +#include "../grumpkin/grumpkin.hpp" +#include "../bn254/bn254.hpp" #include "barretenberg/ecc/groups/wnaf.hpp" -namespace grumpkin { +namespace barretenberg { // simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. // This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
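The explicit instantiations just above pair with the extern template declarations added to the header: template bodies stay out of every includer, and each curve gets exactly one compiled instantiation to link against. A generic sketch of that idiom, with placeholder names:

// header (sketch): define the template, but suppress implicit instantiation in includers
struct Bn254Tag {};
struct GrumpkinTag {};
template <typename Curve> struct Widget {
    int run() const { return 42; }
};
extern template struct Widget<Bn254Tag>;
extern template struct Widget<GrumpkinTag>;

// exactly one .cpp (sketch): provide the instantiations every translation unit links against
template struct Widget<Bn254Tag>;
template struct Widget<GrumpkinTag>;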
namespace scalar_multiplication { @@ -63,12 +65,11 @@ constexpr size_t get_num_rounds(const size_t num_points) return WNAF_SIZE(bits_per_bucket + 1); } -// WORKTODO: uniformize -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; +template struct affine_product_runtime_state { + typename Curve::AffineElement* points; + typename Curve::AffineElement* point_pairs_1; + typename Curve::AffineElement* point_pairs_2; + typename Curve::BaseField* scratch_space; uint32_t* bucket_counts; uint32_t* bit_offsets; uint64_t* point_schedule; @@ -77,12 +78,12 @@ struct affine_product_runtime_state { bool* bucket_empty_status; }; -struct pippenger_runtime_state { +template struct pippenger_runtime_state { uint64_t* point_schedule; bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; + typename Curve::AffineElement* point_pairs_1; + typename Curve::AffineElement* point_pairs_2; + typename Curve::BaseField* scratch_space; uint32_t* bucket_counts; uint32_t* bit_counts; bool* bucket_empty_status; @@ -94,7 +95,13 @@ struct pippenger_runtime_state { pippenger_runtime_state& operator=(pippenger_runtime_state&& other); ~pippenger_runtime_state(); - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); + affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, + const size_t thread_index); }; + +extern template struct affine_product_runtime_state; +extern template struct affine_product_runtime_state; +extern template struct pippenger_runtime_state; +extern template struct pippenger_runtime_state; } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp index d61158b143..b66e775337 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp @@ -10,11 +10,7 @@ #include #include -#include "../../../groups/wnaf.hpp" -#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" -#include "barretenberg/ecc/curves/bn254/fq.hpp" -#include "barretenberg/ecc/curves/bn254/fr.hpp" -#include "barretenberg/ecc/curves/bn254/g1.hpp" +#include "../../groups/wnaf.hpp" #include "./process_buckets.hpp" #include "./runtime_states.hpp" @@ -57,55 +53,59 @@ uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 
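For context on get_optimal_bucket_width() and get_num_rounds(): with a c-bit window, a 254-bit scalar needs roughly 254/c rounds, and each round costs about num_points bucket insertions plus on the order of 2^c additions to fold the buckets. The helper below only illustrates that cost model under those rough assumptions; the library itself picks the width from hand-tuned thresholds rather than a search, and pick_bucket_width/approx_cost are hypothetical names.

#include <cstdint>

namespace sketch {
// Approximate Pippenger cost for n points with a c-bit bucket window over a 254-bit scalar:
// rounds * (per-round bucket insertions + per-round bucket folds).
inline uint64_t approx_cost(uint64_t n, uint64_t c)
{
    const uint64_t rounds = (254 + c - 1) / c;
    return rounds * (n + (uint64_t(1) << c));
}

// Brute-force the window size that minimises the model above (illustrative only).
inline uint64_t pick_bucket_width(uint64_t n)
{
    uint64_t best_c = 1;
    uint64_t best = approx_cost(n, 1);
    for (uint64_t c = 2; c <= 24; ++c) {
        if (approx_cost(n, c) < best) {
            best = approx_cost(n, c);
            best_c = c;
        }
    }
    return best_c;
}
} // namespace sketch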
1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ + state.point_pairs_1 + current_offset + 10, \ + (schedule_k >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ + state.point_pairs_1 + current_offset + 11, \ + (schedule_l >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ + state.point_pairs_1 + current_offset + 12, \ + (schedule_m >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ + state.point_pairs_1 + current_offset + 13, \ + (schedule_n >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ + state.point_pairs_1 + current_offset + 14, \ + (schedule_o >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ + state.point_pairs_1 + current_offset + 15, \ + (schedule_p >> 31ULL) & 1ULL); \ \ current_offset += 16; \ schedule_it += 16; -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) +template +void generate_pippenger_point_table(typename Curve::AffineElement* points, + typename Curve::AffineElement* table, + size_t num_points) { // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); + using Fq = typename Curve::BaseField; + Fq beta 
= Fq::cube_root_of_unity(); for (size_t i = num_points - 1; i < num_points; --i) { table[i * 2] = points[i]; table[i * 2 + 1].x = beta * points[i].x; @@ -193,12 +193,14 @@ void generate_pippenger_point_table(g1::affine_element* points, g1::affine_eleme * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF * @param num_initial_points The number of points before the endomorphism split **/ +template void compute_wnaf_states(uint64_t* point_schedule, bool* input_skew_table, uint64_t* round_counts, - const fr* scalars, + const typename Curve::ScalarField* scalars, const size_t num_initial_points) { + using Fr = typename Curve::ScalarField; const size_t num_points = num_initial_points * 2; constexpr size_t MAX_NUM_ROUNDS = 256; constexpr size_t MAX_NUM_THREADS = 128; @@ -222,30 +224,30 @@ void compute_wnaf_states(uint64_t* point_schedule, #pragma omp parallel for #endif for (size_t i = 0; i < num_threads; ++i) { - fr T0; + Fr T0; uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; + const Fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; uint64_t offset = i * num_points_per_thread; for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); + Fr::split_into_endomorphism_scalars(T0, T0, *(Fr*)&T0.data[2]); + + wnaf::fixed_wnaf_with_counts(&T0.data[0], + &wnaf_table[(j << 1UL)], + skew_table[j << 1ULL], + &thread_round_counts[i][0], + ((j << 1ULL) + offset) << 32ULL, + num_points, + wnaf_bits); + wnaf::fixed_wnaf_with_counts(&T0.data[2], + &wnaf_table[(j << 1UL) + 1], + skew_table[(j << 1UL) + 1], + &thread_round_counts[i][0], + ((j << 1UL) + offset + 1) << 32UL, + num_points, + wnaf_bits); } } @@ -310,9 +312,13 @@ void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t nu * * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) +template +void add_affine_points(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space) { - fq batch_inversion_accumulator = fq::one(); + using Fq = typename Curve::BaseField; + Fq batch_inversion_accumulator = Fq::one(); for (size_t i = 0; i < num_points; i += 2) { scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 @@ -347,9 +353,13 @@ void add_affine_points(g1::affine_element* points, const size_t num_points, fq* } } -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) +template +void add_affine_points_with_edge_cases(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space) { - fq batch_inversion_accumulator = fq::one(); + using Fq = typename Curve::BaseField; + Fq 
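generate_pippenger_point_table() interleaves each point with its GLV endomorphism image, whose x-coordinate is beta * x for the cube root of unity beta, while compute_wnaf_states() splits each scalar into two half-width parts; together they halve the effective scalar length. Below is a toy sketch of the table layout only: the Fq arithmetic and the constant 7 are placeholders, and whether the stored image also negates y depends on the sign convention of split_into_endomorphism_scalars.

#include <cstddef>

namespace sketch {
struct Fq {
    long v;
    Fq operator*(Fq o) const { return { v * o.v }; }
};
struct Affine { Fq x, y; };

// Interleave P_i with its endomorphism image phi(P_i) = (beta * x_i, y_i), matching the
// two-entries-per-point layout produced by generate_pippenger_point_table<Curve>().
inline void build_endo_table(const Affine* points, Affine* table, size_t n)
{
    const Fq beta{ 7 }; // placeholder for the real cube root of unity in Curve::BaseField
    for (size_t i = n; i-- > 0;) { // iterate backwards so `points` and `table` may alias
        table[2 * i] = points[i];
        table[2 * i + 1] = Affine{ beta * points[i].x, points[i].y };
    }
}
} // namespace sketch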
batch_inversion_accumulator = Fq::one(); for (size_t i = 0; i < num_points; i += 2) { if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { @@ -359,7 +369,7 @@ void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t if (points[i].y == points[i + 1].y) { // double scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); + Fq x_squared = points[i].x.sqr(); points[i + 1].x = points[i].y + points[i].y; // 2y points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 points[i + 1].y *= batch_inversion_accumulator; @@ -415,7 +425,10 @@ void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t * `max_bucket_bits` indicates the largest set of nested pairs in the array, * which defines the iteration depth **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) +template +void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases) { size_t end = state.num_points; size_t start = 0; @@ -423,9 +436,9 @@ void evaluate_addition_chains(affine_product_runtime_state& state, const size_t const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); start = end - points_in_round; if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); + add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); + add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); } } } @@ -450,7 +463,10 @@ void evaluate_addition_chains(affine_product_runtime_state& state, const size_t * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. * We repeat this process until every bucket only has one point assigned to it. **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) +template +typename Curve::AffineElement* reduce_buckets(affine_product_runtime_state& state, + bool first_round, + bool handle_edge_cases) { // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); @@ -516,7 +532,7 @@ g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool fir // modify `num_points` to reflect the new number of reduced points. 
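The batch_inversion_accumulator in add_affine_points() is Montgomery's batch-inversion trick: accumulate every (x2 - x1) denominator on a forward pass, invert the accumulated product once, then peel off the individual inverses on a backward pass, so a whole addition chain costs a single field inversion. The edge-case variant additionally handles x1 == x2 (doubling) and points at infinity, which the toy version below skips. Everything here (the modulus 101, the helper names, the slope-only output) is a simplified stand-in for arithmetic in Curve::BaseField.

#include <cstdint>
#include <vector>

namespace sketch {
constexpr uint64_t P = 101; // toy prime modulus (placeholder)

inline uint64_t sub_mod(uint64_t a, uint64_t b) { return (a + P - b) % P; }
inline uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % P; }
inline uint64_t inv_mod(uint64_t a) // single inversion via Fermat's little theorem
{
    uint64_t r = 1;
    for (uint64_t e = P - 2; e != 0; e >>= 1) {
        if (e & 1) { r = mul_mod(r, a); }
        a = mul_mod(a, a);
    }
    return r;
}

struct Pt { uint64_t x, y; };

// Compute the addition slope (y2 - y1) / (x2 - x1) for each pair (2i, 2i + 1) of points,
// sharing one inversion across the whole batch, as add_affine_points does per chain.
inline std::vector<uint64_t> batch_slopes(const std::vector<Pt>& pts)
{
    const size_t pairs = pts.size() / 2;
    std::vector<uint64_t> prefix(pairs), out(pairs);
    uint64_t acc = 1;
    for (size_t i = 0; i < pairs; ++i) { // forward pass: record prefix products of denominators
        prefix[i] = acc;
        acc = mul_mod(acc, sub_mod(pts[2 * i + 1].x, pts[2 * i].x));
    }
    uint64_t inv = inv_mod(acc);        // the only inversion
    for (size_t i = pairs; i-- > 0;) {  // backward pass: peel off each denominator's inverse
        const uint64_t d_inv = mul_mod(inv, prefix[i]);
        inv = mul_mod(inv, sub_mod(pts[2 * i + 1].x, pts[2 * i].x));
        out[i] = mul_mod(d_inv, sub_mod(pts[2 * i + 1].y, pts[2 * i].y));
    }
    return out;
}
} // namespace sketch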
// also swap around the `point_pairs` pointer; what used to be our temporary array // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; + typename Curve::AffineElement* temp = state.point_pairs_1; state.num_points = new_num_points; state.points = state.point_pairs_1; state.point_pairs_1 = state.point_pairs_2; @@ -528,8 +544,10 @@ g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool fir return reduce_buckets(state, false, handle_edge_cases); } -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) +template +uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) { + using Group = typename Curve::Group; // if this is the first call to `construct_addition_chains`, we need to count up our buckets if (empty_bucket_counts) { memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); @@ -615,30 +633,30 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_c >> 32ULL), + state.point_pairs_1 + current_offset + 2, + (schedule_c >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_d >> 32ULL), + state.point_pairs_1 + current_offset + 3, + (schedule_d >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_e >> 32ULL), + state.point_pairs_1 + current_offset + 4, + (schedule_e >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_f >> 32ULL), + state.point_pairs_1 + current_offset + 5, + (schedule_f >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_g >> 32ULL), + state.point_pairs_1 + current_offset + 6, + (schedule_g >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_h >> 32ULL), + state.point_pairs_1 + current_offset + 7, + (schedule_h >> 31ULL) & 
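construct_addition_chains copies each scheduled point into the pair array via Group::conditional_negate_affine, selecting P or -P from the sign bit packed into the schedule entry (point index in the high 32 bits, WNAF sign at bit 31, as the shifts above show). A simplified sketch of that helper's contract follows; the real implementation works on Curve::AffineElement and is optimized, unlike this toy version.

#include <cstdint>

namespace sketch {
struct Fq { int64_t v; };
struct Affine { Fq x, y; };

// Copy *src to *dst, negating y when the predicate bit is set. Callers pass
// points + (schedule >> 32) as the source and (schedule >> 31) & 1 as the predicate.
inline void conditional_negate_affine(const Affine* src, Affine* dst, uint64_t predicate)
{
    dst->x = src->x;
    dst->y = predicate ? Fq{ -src->y.v } : src->y; // toy negation; the real code negates in Fq
}
} // namespace sketch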
1ULL); current_offset += 8; schedule_it += 8; @@ -654,18 +672,18 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_c >> 32ULL), + state.point_pairs_1 + current_offset + 2, + (schedule_c >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_d >> 32ULL), + state.point_pairs_1 + current_offset + 3, + (schedule_d >> 31ULL) & 1ULL); current_offset += 4; schedule_it += 4; break; @@ -678,12 +696,12 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_a = state.point_schedule[schedule_it]; const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); current_offset += 2; schedule_it += 2; break; @@ -695,9 +713,9 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); const uint64_t schedule_a = state.point_schedule[schedule_it]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); ++current_offset; ++schedule_it; break; @@ -712,7 +730,7 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t predicate = (schedule >> 31UL) & 1UL; - g1::conditional_negate_affine( + Group::conditional_negate_affine( state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); ++current_offset; ++schedule_it; @@ -724,11 +742,14 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp return max_bucket_bits; } -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - 
bool handle_edge_cases) +template +typename Curve::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + typename Curve::AffineElement* points, + const size_t num_points, + bool handle_edge_cases) { + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; const size_t num_rounds = get_num_rounds(num_points); #ifndef NO_MULTITHREADING const size_t num_threads = max_threads::compute_num_threads(); @@ -737,8 +758,8 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, #endif const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); + std::unique_ptr thread_accumulators( + static_cast(aligned_alloc(64, num_threads * sizeof(Element))), &aligned_free); #ifndef NO_MULTITHREADING #pragma omp parallel for @@ -750,7 +771,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, const uint64_t num_round_points = state.round_counts[i]; - g1::element accumulator; + Element accumulator; accumulator.self_set_infinity(); if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { @@ -767,13 +788,14 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); + affine_product_runtime_state product_state = + state.get_affine_product_runtime_state(num_threads, j); product_state.num_points = static_cast(num_round_points_per_thread + leftovers); product_state.points = points; product_state.point_schedule = thread_point_schedule; product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; + AffineElement* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); + Element running_sum; running_sum.self_set_infinity(); // one nice side-effect of the affine trick, is that half of the bucket concatenation @@ -796,7 +818,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, if (first_bucket > 0) { uint32_t multiplier = static_cast(first_bucket << 1UL); size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; + Element rolling_accumulator = g1::point_at_infinity; bool init = false; while (shift != static_cast(-1)) { if (init) { @@ -817,8 +839,8 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, if (i == (num_rounds - 1)) { const size_t num_points_per_thread = num_points / num_threads; bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; + AffineElement* point_table = &points[j * num_points_per_thread]; + AffineElement addition_temporary; for (size_t k = 0; k < num_points_per_thread; ++k) { if (skew_table[k]) { addition_temporary = -point_table[k]; @@ -836,7 +858,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, } } - g1::element result; + Element result; result.self_set_infinity(); for (size_t i = 0; i < num_threads; ++i) { result += thread_accumulators[i]; @@ -844,25 +866,31 @@ g1::element 
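Once reduce_buckets() has collapsed each bucket to a single point, evaluate_pippenger_rounds() finishes the round with the standard running-sum reduction: walking the buckets from the top index down, a running sum absorbs each bucket once while the round accumulator absorbs the running sum, which ends up weighting bucket b by exactly b without any scalar multiplications. A toy version with integers standing in for group elements (the real loop also folds in the first_bucket offset and the final skew pass):

#include <cstddef>
#include <cstdint>
#include <vector>

namespace sketch {
// Given bucket sums B[1..m] (index 0 unused), return the sum over b of b * B[b] using only
// additions, the same pattern the per-round reduction applies to Curve::Element values.
inline int64_t weighted_bucket_sum(const std::vector<int64_t>& buckets)
{
    int64_t running = 0; // plays the role of running_sum (initially the point at infinity)
    int64_t total = 0;   // plays the role of the round accumulator
    for (size_t b = buckets.size(); b-- > 1;) {
        running += buckets[b]; // running now holds B[b] + B[b+1] + ... + B[m]
        total += running;      // so B[b] is added b times in total by the end
    }
    return total;
}
// e.g. weighted_bucket_sum({0, 5, 7, 2}) == 1*5 + 2*7 + 3*2 == 25
} // namespace sketch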
evaluate_pippenger_rounds(pippenger_runtime_state& state, return result; } -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) +template +typename Curve::Element pippenger_internal(typename Curve::AffineElement* points, + typename Curve::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases) { // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); + compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); + typename Curve::Element result = + evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); return result; } -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) +template +typename Curve::Element pippenger(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases) { + using Group = typename Curve::Group; + using Element = typename Curve::Element; + // our windowed non-adjacent form algorthm requires that each thread can work on at least 8 points. // If we fall below this theshold, fall back to the traditional scalar multiplication algorithm. // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger @@ -873,20 +901,20 @@ g1::element pippenger(fr* scalars, #endif if (num_initial_points == 0) { - g1::element out = g1::one; + Element out = Group::one; out.self_set_infinity(); return out; } if (num_initial_points <= threshold) { - std::vector exponentiation_results(num_initial_points); + std::vector exponentiation_results(num_initial_points); // might as well multithread this... // Possible optimization: use group::batch_mul_with_endomorphism here. #ifndef NO_MULTITHREADING #pragma omp parallel for #endif for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; + exponentiation_results[i] = Element(points[i * 2]) * scalars[i]; } for (size_t i = num_initial_points - 1; i > 0; --i) { @@ -898,7 +926,7 @@ g1::element pippenger(fr* scalars, const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); const size_t num_slice_points = static_cast(1ULL << slice_bits); - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); + Element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); if (num_slice_points != num_initial_points) { const uint64_t leftover_points = num_initial_points - num_slice_points; @@ -927,21 +955,72 @@ g1::element pippenger(fr* scalars, * Unless you're a malicious adversary, then it would be a great idea! 
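The driver above copes with arbitrary sizes in two ways that are easy to miss in the diff: below a small per-thread threshold it falls back to plain scalar multiplications, and otherwise it peels off the largest power-of-two slice, runs the core routine on it, and recurses on the leftover points (the tail below the threshold is handled by the naive path). The helper below only previews the slice sizes that recursion would produce; it is illustrative and not part of the library.

#include <cstddef>
#include <vector>

namespace sketch {
// Decompose n into the descending power-of-two slice sizes the recursive pippenger()
// calls would process; e.g. 1200000 -> { 1048576, 131072, 16384, ... } (its binary digits).
inline std::vector<size_t> pippenger_slices(size_t n)
{
    std::vector<size_t> slices;
    while (n != 0) {
        size_t msb = 0;
        for (size_t t = n; t >>= 1;) { ++msb; } // index of the most significant set bit
        const size_t slice = size_t(1) << msb;
        slices.push_back(slice);
        n -= slice;
    }
    return slices;
}
} // namespace sketch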
* **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) +template +typename Curve::Element pippenger_unsafe(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state) { return pippenger(scalars, points, num_initial_points, state, false); } -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) + +template +typename Curve::Element pippenger_without_endomorphism_basis_points(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state) { - std::vector G_mod(num_initial_points * 2); - grumpkin::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); + std::vector G_mod(num_initial_points * 2); + barretenberg::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); return pippenger(scalars, &G_mod[0], num_initial_points, state, false); } + +// Explicit instantiation +template uint32_t construct_addition_chains(affine_product_runtime_state& state, + bool empty_bucket_counts = true); + +template void add_affine_points(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +template void add_affine_points_with_edge_cases(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +template void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases); +template curve::BN254::Element pippenger_internal(curve::BN254::AffineElement* points, + curve::BN254::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +template curve::BN254::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + curve::BN254::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +template curve::BN254::AffineElement* reduce_buckets(affine_product_runtime_state& state, + bool first_round = true, + bool handle_edge_cases = false); + +template curve::BN254::Element pippenger(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +template curve::BN254::Element pippenger_unsafe(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +template curve::BN254::Element pippenger_without_endomorphism_basis_points( + curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp index a9a5c9c89d..eed65f8646 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp @@ -1,11 +1,12 @@ #pragma once -#include "../grumpkin.hpp" +#include 
"../bn254/bn254.hpp" +#include "../grumpkin/grumpkin.hpp" #include "./runtime_states.hpp" #include #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { constexpr size_t get_num_buckets(const size_t num_points) @@ -79,18 +80,22 @@ constexpr size_t get_num_buckets(const size_t num_points) * **/ -struct multiplication_thread_state { - g1::element* buckets; +template struct multiplication_thread_state { + typename Curve::Element* buckets; const uint64_t* point_schedule; }; +template void compute_wnaf_states(uint64_t* point_schedule, bool* input_skew_table, uint64_t* round_counts, - const fr* scalars, + const typename Curve::ScalarField* scalars, const size_t num_initial_points); -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); +template +void generate_pippenger_point_table(typename Curve::AffineElement* points, + typename Curve::AffineElement* table, + size_t num_points); void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); @@ -111,44 +116,106 @@ inline void count_bits(uint32_t* bucket_counts, } } -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); +template +uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); +template +void add_affine_points(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space); -void evaluate_addition_chains(affine_product_runtime_state& state, +template +void add_affine_points_with_edge_cases(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space); + +template +void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); +template +typename Curve::Element pippenger_internal(typename Curve::AffineElement* points, + typename Curve::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +template +typename Curve::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + typename Curve::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +template +typename Curve::AffineElement* 
reduce_buckets(affine_product_runtime_state& state, + bool first_round = true, + bool handle_edge_cases = false); + +template +typename Curve::Element pippenger(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +template +typename Curve::Element pippenger_unsafe(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +template +typename Curve::Element pippenger_without_endomorphism_basis_points(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +// Explicit instantiation +extern template uint32_t construct_addition_chains(affine_product_runtime_state& state, + bool empty_bucket_counts = true); + +extern template void add_affine_points(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +extern template void add_affine_points_with_edge_cases(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +extern template void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases); +extern template curve::BN254::Element pippenger_internal(curve::BN254::AffineElement* points, + curve::BN254::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +extern template curve::BN254::Element evaluate_pippenger_rounds( + pippenger_runtime_state& state, + curve::BN254::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +extern template curve::BN254::AffineElement* reduce_buckets( + affine_product_runtime_state& state, bool first_round = true, bool handle_edge_cases = false); + +extern template curve::BN254::Element pippenger(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +extern template curve::BN254::Element pippenger_unsafe(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +extern template curve::BN254::Element pippenger_without_endomorphism_basis_points( + curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp index fa2e5f15c6..e2b09d3d7e 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp @@ -1,46 +1,36 @@ -#include -#include -#include - #include "pippenger.hpp" #include "scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" +#include #include "barretenberg/common/test.hpp" #include "barretenberg/srs/io.hpp" +#include + #include "barretenberg/numeric/random/engine.hpp" -// paths are relative to cpp/build/ -std::string GRUMPKIN_SRS_PATH = "../srs_db/grumpkin"; +#include 
"barretenberg/common/mem.hpp" + +#define BARRETENBERG_SRS_PATH "../srs_db/ignition" -using namespace grumpkin; -using namespace grumpkin::scalar_multiplication; +using namespace barretenberg; +using namespace barretenberg::scalar_multiplication; namespace { auto& engine = numeric::random::get_debug_engine(); } -TEST(grumpkin_scalar_multiplication, fake_transcript_io) -{ - size_t file_num = 0; - std::string transcript_path = io::get_transcript_path(GRUMPKIN_SRS_PATH, file_num); - - std::vector srs(3); - grumpkin::io::read_transcript_g1(&srs[0], /*degree=*/3, GRUMPKIN_SRS_PATH); - // the SRS is [x^i]_1 where x = 2 - EXPECT_EQ(static_cast(g1::one), srs[0]); - EXPECT_EQ(static_cast(g1::one + g1::one), srs[1]); - EXPECT_EQ(static_cast(g1::one + g1::one + g1::one + g1::one), srs[2]); -} +using Curve = curve::BN254; -TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) +TEST(scalar_multiplication, reduce_buckets_simple) { constexpr size_t num_points = 128; - auto pippenger = Pippenger(GRUMPKIN_SRS_PATH, num_points / 2); + g2::affine_element g2_x; + io::read_transcript_g2(g2_x, BARRETENBERG_SRS_PATH); + auto pippenger = Pippenger(BARRETENBERG_SRS_PATH, num_points / 2); auto monomials = pippenger.get_point_table(); std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); std::array bucket_empty_status; - + // 16 buckets, each bucket has one point std::array transcript; std::array transcript_points; transcript_points[0] = 0x0; @@ -197,14 +187,14 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) std::array bucket_counts; std::array bit_offsets = { 0 }; - scalar_multiplication::affine_product_runtime_state product_state{ + scalar_multiplication::affine_product_runtime_state product_state{ &monomials[0], &point_pairs[0], &output_buckets[0], &scratch_space[0], &bucket_counts[0], &bit_offsets[0], &point_schedule[0], num_points, 2, &bucket_empty_status[0] }; - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); + g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); for (size_t i = 0; i < product_state.num_buckets; ++i) { expected[i] = expected[i].normalize(); @@ -213,7 +203,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) } } -TEST(grumpkin_scalar_multiplication, reduce_buckets) +TEST(scalar_multiplication, reduce_buckets) { constexpr size_t num_initial_points = 1 << 12; constexpr size_t num_points = num_initial_points * 2; @@ -235,10 +225,10 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - // WORKTODO: unify by using 0 g2 elts - grumpkin::io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -246,10 +236,10 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) scalars[i] = fr::random_element(); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + 
scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -276,19 +266,19 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - &state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + point_pairs, + scratch_points, + scratch_field, + bucket_counts, + &bit_offsets[0], + &state.point_schedule[num_points], + num_points, + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + // scalar_multiplication::scalar_multiplication_internal(state, monomials); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; @@ -309,7 +299,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) size_t it = 0; - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); + g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); printf("num buckets = %zu \n", num_buckets); for (size_t i = 0; i < num_buckets; ++i) { @@ -334,7 +324,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) } // This test intermittenly fails. 
-TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) +TEST(scalar_multiplication, DISABLED_reduce_buckets_basic) { constexpr size_t num_initial_points = 1 << 20; constexpr size_t num_points = num_initial_points * 2; @@ -352,7 +342,8 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -362,11 +353,11 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -386,20 +377,20 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + point_pairs, + scratch_points, + scratch_field, + bucket_counts, + &bit_offsets[0], + state.point_schedule, + (uint32_t)state.round_counts[0], + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + scalar_multiplication::reduce_buckets(product_state, true); + // scalar_multiplication::scalar_multiplication_internal(state, monomials); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; @@ -413,7 +404,7 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) aligned_free(bucket_counts); } -TEST(grumpkin_scalar_multiplication, add_affine_points) +TEST(scalar_multiplication, add_affine_points) { constexpr size_t num_points = 20; g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); @@ -434,7 +425,7 @@ TEST(grumpkin_scalar_multiplication, add_affine_points) points_copy[count + 1] = points_copy[count + 1].normalize(); } - scalar_multiplication::add_affine_points(points, num_points, scratch_space); + scalar_multiplication::add_affine_points(points, num_points, scratch_space); for (size_t i = 
num_points - 1; i > num_points - 1 - (num_points / 2); --i) { EXPECT_EQ((points[i].x == points_copy[i].x), true); EXPECT_EQ((points[i].y == points_copy[i].y), true); @@ -445,13 +436,14 @@ TEST(grumpkin_scalar_multiplication, add_affine_points) aligned_free(scratch_space); } -TEST(grumpkin_scalar_multiplication, construct_addition_chains) +TEST(scalar_multiplication, construct_addition_chains) { constexpr size_t num_initial_points = 1 << 20; constexpr size_t num_points = num_initial_points * 2; g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -461,11 +453,11 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -485,22 +477,23 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - num_points, - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + monomials, + monomials, + nullptr, + bucket_counts, + &bit_offsets[0], + state.point_schedule, + static_cast( + state.round_counts[0]), + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + scalar_multiplication::construct_addition_chains(product_state, true); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); + info("construct addition chains: ", diff.count(), "ms"); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; aligned_free(bucket_empty_status); @@ -509,7 +502,7 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) aligned_free(bucket_counts); } -TEST(grumpkin_scalar_multiplication, endomorphism_split) +TEST(scalar_multiplication, endomorphism_split) { fr scalar = fr::random_element(); @@ -545,7 +538,7 @@ TEST(grumpkin_scalar_multiplication, endomorphism_split) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, radix_sort) +TEST(scalar_multiplication, radix_sort) { // check that our radix sort correctly sorts! 
constexpr size_t target_degree = 1 << 8; @@ -558,8 +551,8 @@ TEST(grumpkin_scalar_multiplication, radix_sort) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::pippenger_runtime_state state(target_degree); + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); @@ -590,7 +583,7 @@ TEST(grumpkin_scalar_multiplication, radix_sort) free(wnaf_copy); } -HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) +HEAVY_TEST(scalar_multiplication, oversized_inputs) { // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. // Check that this is done correctly @@ -598,12 +591,13 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) size_t target_degree = 1200000; g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - io::read_transcript(monomials, transcript_degree, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, transcript_degree, BARRETENBERG_SRS_PATH); memcpy((void*)(monomials + (2 * transcript_degree)), (void*)monomials, ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); @@ -613,17 +607,17 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) accumulator *= source_scalar; fr::__copy(accumulator, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(target_degree); + scalar_multiplication::pippenger_runtime_state state(target_degree); - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); + g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); first = first.normalize(); for (size_t i = 0; i < target_degree; ++i) { scalars[i].self_neg(); } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); + scalar_multiplication::pippenger_runtime_state state_2(target_degree); - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); + g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); second = second.normalize(); EXPECT_EQ((first.z == second.z), true); @@ -635,7 +629,7 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) aligned_free(scalars); } -TEST(grumpkin_scalar_multiplication, undersized_inputs) +TEST(scalar_multiplication, undersized_inputs) { // we fall back to traditional scalar multiplication algorithm for small input sizes. 
// Check this is done correctly @@ -658,11 +652,11 @@ TEST(grumpkin_scalar_multiplication, undersized_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -671,7 +665,7 @@ TEST(grumpkin_scalar_multiplication, undersized_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger) +TEST(scalar_multiplication, pippenger) { constexpr size_t num_points = 8192; @@ -692,10 +686,10 @@ TEST(grumpkin_scalar_multiplication, pippenger) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -704,7 +698,7 @@ TEST(grumpkin_scalar_multiplication, pippenger) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) +TEST(scalar_multiplication, pippenger_edge_case_dbl) { constexpr size_t num_points = 128; @@ -728,9 +722,9 @@ TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) if (!expected.is_point_at_infinity()) { expected = expected.normalize(); } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -739,7 +733,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) +TEST(scalar_multiplication, pippenger_short_inputs) { constexpr size_t num_points = 8192; @@ -780,10 +774,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -792,7 +786,7 @@ 
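For orientation, this is the call sequence the tests above repeat, written once against the templatized API this patch introduces (assumes the includes at the top of the test file; allocation, SRS loading and normalisation are elided, and msm_sketch is a hypothetical wrapper name).

#include "pippenger.hpp"
#include "scalar_multiplication.hpp"

using namespace barretenberg;
using Curve = curve::BN254;

Curve::Element msm_sketch(Curve::ScalarField* scalars, Curve::AffineElement* points, size_t num_points)
{
    // `points` must hold 2 * num_points entries: the table interleaves each P_i with its endo image.
    scalar_multiplication::generate_pippenger_point_table<Curve>(points, points, num_points);
    scalar_multiplication::pippenger_runtime_state<Curve> state(num_points);
    return scalar_multiplication::pippenger<Curve>(scalars, points, num_points, state);
}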
TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_unsafe) +TEST(scalar_multiplication, pippenger_unsafe) { constexpr size_t num_points = 8192; @@ -812,10 +806,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); + scalar_multiplication::pippenger_runtime_state state(num_points); + g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -824,7 +818,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) +TEST(scalar_multiplication, pippenger_unsafe_short_inputs) { constexpr size_t num_points = 8192; @@ -866,10 +860,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -878,7 +872,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_one) +TEST(scalar_multiplication, pippenger_one) { size_t num_points = 1; @@ -899,10 +893,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_one) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -911,14 +905,14 @@ TEST(grumpkin_scalar_multiplication, pippenger_one) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_zero_points) +TEST(scalar_multiplication, pippenger_zero_points) { fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); + scalar_multiplication::pippenger_runtime_state state(0); + g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); aligned_free(scalars); aligned_free(points); @@ -926,7 
+920,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_zero_points) EXPECT_EQ(result.is_point_at_infinity(), true); } -TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) +TEST(scalar_multiplication, pippenger_mul_by_zero) { fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); @@ -934,10 +928,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) scalars[0] = fr::zero(); points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); + scalar_multiplication::generate_pippenger_point_table(points, points, 1); - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); + scalar_multiplication::pippenger_runtime_state state(1); + g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); aligned_free(scalars); aligned_free(points); diff --git a/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp b/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp index 5436fcb8f6..070222911c 100644 --- a/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp +++ b/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp @@ -153,7 +153,8 @@ class SECP256K1 { public: using ScalarField = secp256k1::fr; using BaseField = secp256k1::fq; - using ProjectiveElement = typename secp256k1::g1::element; - using AffineElement = typename secp256k1::g1::affine_element; + using Group = secp256k1::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp b/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp index 4a9e0b7c90..2d04e47c90 100644 --- a/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp +++ b/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp @@ -140,7 +140,8 @@ class SECP256R1 { public: using ScalarField = secp256r1::fr; using BaseField = secp256r1::fq; - using ProjectiveElement = typename secp256r1::g1::element; - using AffineElement = typename secp256r1::g1::affine_element; + using Group = secp256r1::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/honk/pcs/commitment_key.hpp b/cpp/src/barretenberg/honk/pcs/commitment_key.hpp index 5d4e32c80d..2f95c33f2c 100644 --- a/cpp/src/barretenberg/honk/pcs/commitment_key.hpp +++ b/cpp/src/barretenberg/honk/pcs/commitment_key.hpp @@ -8,7 +8,7 @@ #include "barretenberg/polynomials/polynomial_arithmetic.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include "barretenberg/srs/reference_string/file_reference_string.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/ecc/curves/bn254/pairing.hpp" #include "barretenberg/numeric/bitop/pow.hpp" @@ -61,12 +61,12 @@ class CommitmentKey { { const size_t degree = polynomial.size(); ASSERT(degree <= srs.get_monomial_size()); - return barretenberg::scalar_multiplication::pippenger_unsafe( + return barretenberg::scalar_multiplication::pippenger_unsafe( const_cast(polynomial.data()), srs.get_monomial_points(), degree, pippenger_runtime_state); }; private: - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + 
barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; @@ -239,11 +239,11 @@ class CommitmentKey { { const size_t degree = polynomial.size(); ASSERT(degree <= srs.get_monomial_size()); - return barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + return barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( const_cast(polynomial.data()), srs.get_monomial_points(), degree, pippenger_runtime_state); }; - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; @@ -268,7 +268,7 @@ class VerificationKey { , srs(num_points, std::string(path)) {} - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; diff --git a/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp b/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp index a7eaf54f91..c571cb5d89 100644 --- a/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp +++ b/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp @@ -2,7 +2,7 @@ #include #include #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/honk/pcs/commitment_key.hpp" #include "barretenberg/stdlib/primitives/curves/bn254.hpp" @@ -88,13 +88,15 @@ template class InnerProductArgument { inner_prod_R += a_vec[round_size + j] * b_vec[j]; } // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator - L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); + L_elements[i] = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); L_elements[i] += aux_generator * inner_prod_L; // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator - R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state); + R_elements[i] = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state); R_elements[i] += aux_generator * inner_prod_R; std::string index = std::to_string(i); @@ -178,8 +180,9 @@ template class InnerProductArgument { msm_scalars[2 * i] = round_challenges[i].sqr(); msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr(); } - Commitment LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state); + Commitment LR_sums = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state); Commitment C_zero = C_prime + LR_sums; /** @@ -218,7 +221,7 @@ template class InnerProductArgument { for (size_t i = 0; i < poly_degree; i++) { G_vec_local[i] = srs_elements[i]; } - auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + auto G_zero = 
barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state); auto a_zero = transcript.template receive_from_prover("IPA:a_0"); diff --git a/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp b/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp index 628a969d26..8b9de4e124 100644 --- a/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp +++ b/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp @@ -2,10 +2,10 @@ #include "barretenberg/honk/transcript/transcript.hpp" #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/honk/flavor/standard.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/honk/utils/power_polynomial.hpp" -#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" // TODO(Cody): this needs to go. using namespace barretenberg; using namespace proof_system::honk::sumcheck; diff --git a/cpp/src/barretenberg/honk/proof_system/verifier.cpp b/cpp/src/barretenberg/honk/proof_system/verifier.cpp index 3506f15aaa..259c9d13db 100644 --- a/cpp/src/barretenberg/honk/proof_system/verifier.cpp +++ b/cpp/src/barretenberg/honk/proof_system/verifier.cpp @@ -2,7 +2,7 @@ #include "barretenberg/honk/transcript/transcript.hpp" #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/honk/flavor/standard.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/honk/utils/power_polynomial.hpp" using namespace barretenberg; diff --git a/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp b/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp index 5ee4b90fad..ba8e78c037 100644 --- a/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp +++ b/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp @@ -3,6 +3,7 @@ #include "c_bind.h" #include "join_split.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" // WORKTODO: needed? #include "compute_signing_data.hpp" #include "../mock/mock_circuit.hpp" #include "barretenberg/common/streams.hpp" @@ -59,7 +60,7 @@ WASM_EXPORT uint32_t join_split__get_new_proving_key_data(uint8_t** output) WASM_EXPORT void join_split__init_verification_key(void* pippenger, uint8_t const* g2x) { auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); init_verification_key(std::move(crs_factory)); } diff --git a/cpp/src/barretenberg/plonk/composer/composer_base.cpp b/cpp/src/barretenberg/plonk/composer/composer_base.cpp index 46e404a57e..e3698dc434 100644 --- a/cpp/src/barretenberg/plonk/composer/composer_base.cpp +++ b/cpp/src/barretenberg/plonk/composer/composer_base.cpp @@ -330,10 +330,10 @@ std::shared_ptr ComposerBase::compute_verification_key_base( // Commit to the constraint selector polynomial and insert the commitment in the verification key. 
auto selector_poly_commitment = g1::affine_element( - scalar_multiplication::pippenger(selector_poly_coefficients, - proving_key->reference_string->get_monomial_points(), - proving_key->circuit_size, - proving_key->pippenger_runtime_state)); + scalar_multiplication::pippenger(selector_poly_coefficients, + proving_key->reference_string->get_monomial_points(), + proving_key->circuit_size, + proving_key->pippenger_runtime_state)); circuit_verification_key->commitments.insert({ selector_commitment_label, selector_poly_commitment }); } diff --git a/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp b/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp index c367d9ccd1..5b8ee42d10 100644 --- a/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp +++ b/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp @@ -1,6 +1,6 @@ #include "turbo_plonk_composer_helper.hpp" #include "barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/turbo_arithmetic_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/standard_composer.cpp b/cpp/src/barretenberg/plonk/composer/standard_composer.cpp index 1158590acf..dc04716864 100644 --- a/cpp/src/barretenberg/plonk/composer/standard_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/standard_composer.cpp @@ -1,6 +1,6 @@ #include "standard_composer.hpp" #include "barretenberg/plonk/proof_system/types/prover_settings.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/arithmetic_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp b/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp index f3339cf147..20f686e4d3 100644 --- a/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp @@ -1,5 +1,5 @@ #include "turbo_composer.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? 
#include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/turbo_arithmetic_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp index ce0f4eac7b..fab8b9c575 100644 --- a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp @@ -1,6 +1,6 @@ #include "ultra_composer.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include #include diff --git a/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp b/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp index b7730585e1..32a14c0ac6 100644 --- a/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp @@ -3,7 +3,7 @@ #include "barretenberg/plonk/proof_system/types/prover_settings.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/iterate_over_domain.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" diff --git a/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp b/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp index f3198a5cbb..f30b0bbf9f 100644 --- a/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp +++ b/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp @@ -1,14 +1,15 @@ #pragma once -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp" #include +#include + +#include "barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp" #include "barretenberg/polynomials/evaluation_domain.hpp" #include "barretenberg/polynomials/polynomial.hpp" - #include "barretenberg/proof_system/polynomial_store/polynomial_store.hpp" #include "barretenberg/srs/reference_string/reference_string.hpp" #include "barretenberg/plonk/proof_system/constants.hpp" #include "barretenberg/plonk/proof_system/types/polynomial_manifest.hpp" -#include +#include "barretenberg/ecc/curves/bn254/bn254.hpp" namespace proof_system::plonk { @@ -62,7 +63,7 @@ struct proving_key { barretenberg::polynomial quotient_polynomial_parts[plonk::NUM_QUOTIENT_PARTS]; - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; PolynomialManifest polynomial_manifest; diff --git a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp index bee400fd36..6f37c631ac 100644 --- a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp @@ -5,7 +5,7 @@ #include "../utils/kate_verification.hpp" #include "barretenberg/ecc/curves/bn254/fq12.hpp" #include "barretenberg/ecc/curves/bn254/pairing.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include 
"barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" using namespace barretenberg; @@ -176,12 +176,13 @@ template bool VerifierBase::verify size_t num_elements = elements.size(); elements.resize(num_elements * 2); - barretenberg::scalar_multiplication::generate_pippenger_point_table(&elements[0], &elements[0], num_elements); - scalar_multiplication::pippenger_runtime_state state(num_elements); + barretenberg::scalar_multiplication::generate_pippenger_point_table( + &elements[0], &elements[0], num_elements); + scalar_multiplication::pippenger_runtime_state state(num_elements); g1::element P[2]; - P[0] = barretenberg::scalar_multiplication::pippenger(&scalars[0], &elements[0], num_elements, state); + P[0] = barretenberg::scalar_multiplication::pippenger(&scalars[0], &elements[0], num_elements, state); P[1] = -(g1::element(PI_Z_OMEGA) * separator_challenge + PI_Z); if (key->contains_recursive_proof) { diff --git a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp index 701a465189..eccd0ed6fa 100644 --- a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp @@ -5,7 +5,7 @@ #include "../../../transcript/transcript.hpp" #include "barretenberg/plonk/composer/standard_composer.hpp" #include "verifier.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include #include "barretenberg/srs/reference_string/file_reference_string.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -29,15 +29,15 @@ plonk::Verifier generate_verifier(std::shared_ptr circuit_proving_k poly_coefficients[7] = circuit_proving_key->polynomial_store.get("sigma_3").get_coefficients(); std::vector commitments; - scalar_multiplication::pippenger_runtime_state state(circuit_proving_key->circuit_size); + scalar_multiplication::pippenger_runtime_state state(circuit_proving_key->circuit_size); commitments.resize(8); for (size_t i = 0; i < 8; ++i) { commitments[i] = g1::affine_element( - scalar_multiplication::pippenger(poly_coefficients[i], - circuit_proving_key->reference_string->get_monomial_points(), - circuit_proving_key->circuit_size, - state)); + scalar_multiplication::pippenger(poly_coefficients[i], + circuit_proving_key->reference_string->get_monomial_points(), + circuit_proving_key->circuit_size, + state)); } auto crs = std::make_shared("../srs_db/ignition"); diff --git a/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp b/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp index f59863bc5e..ec2b12d46b 100644 --- a/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp +++ b/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp @@ -1,6 +1,6 @@ #pragma once #include "barretenberg/common/mem.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/plonk/proof_system/proving_key/proving_key.hpp" #include "barretenberg/plonk/proof_system/public_inputs/public_inputs.hpp" #include "barretenberg/transcript/transcript.hpp" @@ -229,7 +229,7 @@ void 
ProverPermutationWidget(state.range(0)); state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(num_points); + scalar_multiplication::pippenger_runtime_state run_state(num_points); state.ResumeTiming(); // uint64_t before = rdtsc(); - scalar_multiplication::pippenger(&globals.scalars[0], &globals.monomials[0], num_points, run_state); + scalar_multiplication::pippenger( + &globals.scalars[0], &globals.monomials[0], num_points, run_state); // uint64_t after = rdtsc(); // count += (after - before); // ++i; @@ -143,7 +144,7 @@ void unsafe_pippenger_bench(State& state) noexcept uint64_t i = 0; for (auto _ : state) { state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(num_points); + scalar_multiplication::pippenger_runtime_state run_state(num_points); state.ResumeTiming(); uint64_t before = rdtsc(); @@ -164,28 +165,28 @@ void new_plonk_scalar_multiplications_bench(State& state) noexcept uint64_t k = 0; for (auto _ : state) { state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(MAX_GATES); + scalar_multiplication::pippenger_runtime_state run_state(MAX_GATES); state.ResumeTiming(); uint64_t before = rdtsc(); - g1::element a = - scalar_multiplication::pippenger(&globals.scalars[0], &globals.monomials[0], MAX_GATES, run_state); - g1::element b = - scalar_multiplication::pippenger(&globals.scalars[1], &globals.monomials[0], MAX_GATES, run_state); - g1::element c = - scalar_multiplication::pippenger(&globals.scalars[2], &globals.monomials[0], MAX_GATES, run_state); - g1::element d = - scalar_multiplication::pippenger(&globals.scalars[3], &globals.monomials[0], MAX_GATES, run_state); - g1::element e = - scalar_multiplication::pippenger(&globals.scalars[4], &globals.monomials[0], MAX_GATES, run_state); - g1::element f = - scalar_multiplication::pippenger(&globals.scalars[5], &globals.monomials[0], MAX_GATES, run_state); - g1::element g = - scalar_multiplication::pippenger(&globals.scalars[6], &globals.monomials[0], MAX_GATES, run_state); - g1::element h = - scalar_multiplication::pippenger(&globals.scalars[7], &globals.monomials[0], MAX_GATES, run_state); - g1::element i = - scalar_multiplication::pippenger(&globals.scalars[8], &globals.monomials[0], MAX_GATES, run_state); + g1::element a = scalar_multiplication::pippenger( + &globals.scalars[0], &globals.monomials[0], MAX_GATES, run_state); + g1::element b = scalar_multiplication::pippenger( + &globals.scalars[1], &globals.monomials[0], MAX_GATES, run_state); + g1::element c = scalar_multiplication::pippenger( + &globals.scalars[2], &globals.monomials[0], MAX_GATES, run_state); + g1::element d = scalar_multiplication::pippenger( + &globals.scalars[3], &globals.monomials[0], MAX_GATES, run_state); + g1::element e = scalar_multiplication::pippenger( + &globals.scalars[4], &globals.monomials[0], MAX_GATES, run_state); + g1::element f = scalar_multiplication::pippenger( + &globals.scalars[5], &globals.monomials[0], MAX_GATES, run_state); + g1::element g = scalar_multiplication::pippenger( + &globals.scalars[6], &globals.monomials[0], MAX_GATES, run_state); + g1::element h = scalar_multiplication::pippenger( + &globals.scalars[7], &globals.monomials[0], MAX_GATES, run_state); + g1::element i = scalar_multiplication::pippenger( + &globals.scalars[8], &globals.monomials[0], MAX_GATES, run_state); uint64_t after = rdtsc(); count += (after - before); ++k; diff --git a/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp 
b/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp index 8bc4b461e2..c8a852015e 100644 --- a/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp +++ b/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp @@ -1,5 +1,5 @@ #include "turbo_circuit_constructor.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" using namespace barretenberg; diff --git a/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp b/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp index 038716f4f8..3214799518 100644 --- a/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp +++ b/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp @@ -1,6 +1,6 @@ #include "work_queue.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" namespace proof_system::plonk { @@ -206,8 +206,8 @@ void work_queue::process_queue() barretenberg::g1::affine_element* srs_points = key->reference_string->get_monomial_points(); // Run pippenger multi-scalar multiplication. - auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(msm_size); - barretenberg::g1::affine_element result(barretenberg::scalar_multiplication::pippenger_unsafe( + auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(msm_size); + barretenberg::g1::affine_element result(barretenberg::scalar_multiplication::pippenger_unsafe( item.mul_scalars, srs_points, msm_size, runtime_state)); transcript->add_element(item.tag, result.to_buffer()); diff --git a/cpp/src/barretenberg/srs/io.cpp b/cpp/src/barretenberg/srs/io.cpp index 6e77540690..3b51adaf67 100644 --- a/cpp/src/barretenberg/srs/io.cpp +++ b/cpp/src/barretenberg/srs/io.cpp @@ -35,9 +35,9 @@ void read_manifest(std::string const& filename, Manifest& manifest) manifest.start_from = ntohl(manifest.start_from); } -void byteswap(g1::affine_element* elements, size_t elements_size) +template void byteswap(typename Curve::AffineElement* elements, size_t elements_size) { - constexpr size_t bytes_per_element = sizeof(g1::affine_element); + constexpr size_t bytes_per_element = sizeof(typename Curve::AffineElement); size_t num_elements = elements_size / bytes_per_element; if (is_little_endian()) { @@ -59,7 +59,7 @@ void byteswap(g1::affine_element* elements, size_t elements_size) void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size) { memcpy((void*)elements, (void*)buffer, buffer_size); - byteswap(elements, buffer_size); + byteswap(elements, buffer_size); } void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size) @@ -138,7 +138,8 @@ bool is_file_exist(std::string const& fileName) return infile.good(); } -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir) +template +void read_transcript_g1(typename Curve::AffineElement* monomials, size_t degree, std::string const& dir) { size_t num = 0; size_t num_read = 0; @@ -158,7 +159,7 @@ void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::strin // We must pass the size actually read to 
the second call, not the desired // g1_buffer_size as the file may have been smaller than this. read_file_into_buffer(buffer, size, path, offset, g1_buffer_size); - byteswap(&monomials[num_read], size); + byteswap(&monomials[num_read], size); num_read += num_to_read; path = get_transcript_path(dir, ++num); @@ -215,7 +216,7 @@ void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir) void read_transcript(g1::affine_element* monomials, g2::affine_element& g2_x, size_t degree, std::string const& path) { - read_transcript_g1(monomials, degree, path); + read_transcript_g1(monomials, degree, path); read_transcript_g2(g2_x, path); } @@ -363,31 +364,31 @@ void read_manifest(std::string const& filename, Manifest& manifest) manifest.start_from = ntohl(manifest.start_from); } -void byteswap(g1::affine_element* elements, size_t elements_size) -{ - constexpr size_t bytes_per_element = sizeof(g1::affine_element); - size_t num_elements = elements_size / bytes_per_element; +// void byteswap(g1::affine_element* elements, size_t elements_size) +// { +// constexpr size_t bytes_per_element = sizeof(g1::affine_element); +// size_t num_elements = elements_size / bytes_per_element; - if (is_little_endian()) { - for (size_t i = 0; i < num_elements; ++i) { - elements[i].x.data[0] = __builtin_bswap64(elements[i].x.data[0]); - elements[i].x.data[1] = __builtin_bswap64(elements[i].x.data[1]); - elements[i].x.data[2] = __builtin_bswap64(elements[i].x.data[2]); - elements[i].x.data[3] = __builtin_bswap64(elements[i].x.data[3]); - elements[i].y.data[0] = __builtin_bswap64(elements[i].y.data[0]); - elements[i].y.data[1] = __builtin_bswap64(elements[i].y.data[1]); - elements[i].y.data[2] = __builtin_bswap64(elements[i].y.data[2]); - elements[i].y.data[3] = __builtin_bswap64(elements[i].y.data[3]); - elements[i].x.self_to_montgomery_form(); - elements[i].y.self_to_montgomery_form(); - } - } -} +// if (is_little_endian()) { +// for (size_t i = 0; i < num_elements; ++i) { +// elements[i].x.data[0] = __builtin_bswap64(elements[i].x.data[0]); +// elements[i].x.data[1] = __builtin_bswap64(elements[i].x.data[1]); +// elements[i].x.data[2] = __builtin_bswap64(elements[i].x.data[2]); +// elements[i].x.data[3] = __builtin_bswap64(elements[i].x.data[3]); +// elements[i].y.data[0] = __builtin_bswap64(elements[i].y.data[0]); +// elements[i].y.data[1] = __builtin_bswap64(elements[i].y.data[1]); +// elements[i].y.data[2] = __builtin_bswap64(elements[i].y.data[2]); +// elements[i].y.data[3] = __builtin_bswap64(elements[i].y.data[3]); +// elements[i].x.self_to_montgomery_form(); +// elements[i].y.self_to_montgomery_form(); +// } +// } +// } void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size) { memcpy((void*)elements, (void*)buffer, buffer_size); - byteswap(elements, buffer_size); + barretenberg::io::byteswap(elements, buffer_size); } // void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size) @@ -486,7 +487,7 @@ void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::strin // We must pass the size actually read to the second call, not the desired // g1_buffer_size as the file may have been smaller than this. 
read_file_into_buffer(buffer, size, path, offset, g1_buffer_size); - byteswap(&monomials[num_read], size); + barretenberg::io::byteswap(&monomials[num_read], size); num_read += num_to_read; path = get_transcript_path(dir, ++num); @@ -671,3 +672,17 @@ void write_transcript(g1::affine_element const* g1_x, } // namespace io } // namespace grumpkin + +// WORKTODO: hack +namespace barretenberg { +namespace io { +template void read_transcript_g1(barretenberg::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +template void byteswap(barretenberg::g1::affine_element* elements, size_t buffer_size); +template void read_transcript_g1(grumpkin::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +template void byteswap(grumpkin::g1::affine_element* elements, size_t buffer_size); +} // namespace io +} // namespace barretenberg diff --git a/cpp/src/barretenberg/srs/io.hpp b/cpp/src/barretenberg/srs/io.hpp index f08a53f9c4..5af499ed09 100644 --- a/cpp/src/barretenberg/srs/io.hpp +++ b/cpp/src/barretenberg/srs/io.hpp @@ -1,6 +1,5 @@ #pragma once -#include "../ecc/curves/bn254/g1.hpp" -#include "../ecc/curves/bn254/g2.hpp" +#include "../ecc/curves/bn254/bn254.hpp" #include "../ecc/curves/grumpkin/grumpkin.hpp" #include #include @@ -18,14 +17,16 @@ struct Manifest { uint32_t start_from; }; -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); +template +void read_transcript_g1(typename Curve::AffineElement* monomials, size_t degree, std::string const& dir); void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir); void read_transcript(g1::affine_element* monomials, g2::affine_element& g2_x, size_t degree, std::string const& path); void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size); -void byteswap(g1::affine_element* elements, size_t buffer_size); + +template void byteswap(typename Curve::AffineElement* elements, size_t buffer_size); void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size); void byteswap(g2::affine_element* elements, size_t buffer_size); @@ -61,7 +62,7 @@ struct Manifest { std::string get_transcript_path(std::string const& dir, size_t num); -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); +// void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); // void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir); @@ -88,3 +89,17 @@ void write_transcript(g1::affine_element const* g1_x, } // namespace io } // namespace grumpkin + +// WORKTODO: hack +namespace barretenberg { +namespace io { +extern template void read_transcript_g1(barretenberg::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +extern template void byteswap(barretenberg::g1::affine_element* elements, size_t buffer_size); +extern template void read_transcript_g1(grumpkin::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +extern template void byteswap(grumpkin::g1::affine_element* elements, size_t buffer_size); +} // namespace io +} // namespace barretenberg diff --git a/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp index cceb6eef82..a53784be2b 100644 --- a/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp +++ 
b/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp @@ -11,7 +11,7 @@ #include "barretenberg/ecc/curves/bn254/g1.hpp" #include "barretenberg/ecc/curves/bn254/g2.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" #include "barretenberg/env/crs.hpp" @@ -32,7 +32,7 @@ class EnvReferenceString : public ProverReferenceString { private: size_t num_points; - scalar_multiplication::Pippenger pippenger_; + scalar_multiplication::Pippenger pippenger_; }; class EnvReferenceStringFactory : public ReferenceStringFactory { diff --git a/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp index ef6bebfc0a..37622675e9 100644 --- a/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp @@ -6,7 +6,7 @@ #include "barretenberg/ecc/curves/bn254/g1.hpp" #include "barretenberg/ecc/curves/bn254/g2.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" #include #include @@ -46,7 +46,7 @@ class FileReferenceString : public ProverReferenceString { private: size_t num_points; - scalar_multiplication::Pippenger pippenger_; + scalar_multiplication::Pippenger pippenger_; }; class FileReferenceStringFactory : public ReferenceStringFactory { diff --git a/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp index 608446fa77..50cbacbcac 100644 --- a/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp @@ -5,7 +5,7 @@ #include "reference_string.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" // WORKTODO: needed? 
namespace barretenberg::pairing { struct miller_lines; diff --git a/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp index 53c6867018..6bea00b53f 100644 --- a/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp @@ -15,7 +15,7 @@ using namespace barretenberg; class PippengerReferenceString : public ProverReferenceString { public: - PippengerReferenceString(scalar_multiplication::Pippenger* pippenger) + PippengerReferenceString(scalar_multiplication::Pippenger* pippenger) : pippenger_(pippenger) {} @@ -23,12 +23,12 @@ class PippengerReferenceString : public ProverReferenceString { g1::affine_element* get_monomial_points() override { return pippenger_->get_point_table(); } private: - scalar_multiplication::Pippenger* pippenger_; + scalar_multiplication::Pippenger* pippenger_; }; class PippengerReferenceStringFactory : public ReferenceStringFactory { public: - PippengerReferenceStringFactory(scalar_multiplication::Pippenger* pippenger, uint8_t const* g2x) + PippengerReferenceStringFactory(scalar_multiplication::Pippenger* pippenger, uint8_t const* g2x) : pippenger_(pippenger) , g2x_(g2x) {} @@ -47,7 +47,7 @@ class PippengerReferenceStringFactory : public ReferenceStringFactory { } private: - scalar_multiplication::Pippenger* pippenger_; + scalar_multiplication::Pippenger* pippenger_; uint8_t const* g2x_; };
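
Note on the API shape introduced above (not part of the patch): the hunks thread a curve template parameter through the scalar-multiplication entry points (pippenger_runtime_state, pippenger, pippenger_unsafe, generate_pippenger_point_table, the Pippenger wrapper, and io::read_transcript_g1 / io::byteswap), with the curve class supplying the ScalarField, BaseField, Group, Element and AffineElement aliases shown in the secp256k1/secp256r1 hunks. The sketch below is a minimal illustration of how a caller might drive the templatized MSM path. It is an assumption-laden sketch: the exact template-argument spelling (e.g. curve::BN254, curve::Grumpkin, or a generic Curve) is not visible in the text above because the angle-bracket arguments were lost; the argument lists mirror the test hunks verbatim, and msm is a hypothetical helper name.

    // Hypothetical usage sketch, not part of this patch. Template-argument
    // placement is assumed from the surrounding diff; function argument lists
    // follow the scalar_multiplication tests above.
    #include "barretenberg/ecc/curves/bn254/bn254.hpp"
    #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
    #include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp"

    template <typename Curve>
    typename Curve::Element msm(typename Curve::ScalarField* scalars,
                                typename Curve::AffineElement* points, // caller provides 2 * num_points slots
                                size_t num_points)
    {
        // Expand the input points into the endomorphism point table in place,
        // exactly as the tests do before every pippenger call.
        barretenberg::scalar_multiplication::generate_pippenger_point_table<Curve>(points, points, num_points);
        // Scratch state sized for this multi-exponentiation.
        barretenberg::scalar_multiplication::pippenger_runtime_state<Curve> state(num_points);
        // Run Pippenger's algorithm; per the undersized_inputs test, small inputs
        // fall back to plain scalar multiplication internally.
        return barretenberg::scalar_multiplication::pippenger<Curve>(scalars, points, num_points, state);
    }

    // The same helper would serve both curves exercised by the tests, e.g.:
    //   auto p = msm<curve::BN254>(bn254_scalars, bn254_points, n);
    //   auto q = msm<curve::Grumpkin>(grumpkin_scalars, grumpkin_points, n);

One design point that is visible in the diff itself: io.cpp now ends with explicit instantiations of read_transcript_g1 and byteswap for the bn254 and grumpkin g1 affine element types, paired with extern template declarations at the bottom of io.hpp (both marked "WORKTODO: hack"), so these templates are compiled once in io.cpp instead of in every translation unit that includes the header.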