From 0989ed5732e01c181654c7554c60ef1e06b09090 Mon Sep 17 00:00:00 2001
From: Zachary James Williamson
Date: Wed, 17 May 2023 19:47:42 +0200
Subject: [PATCH] Zw/noir recursion 2 (#414)

* removed redundant `reduce` operations after negating biggroup elements

simplified hash input structure when hashing transcripts
cached partial non native field multiplications
reverted how native transcript computes hash buffers
pedersen_plookup can be configured to skip the hash_single range check under limited conditions
fixed the range check in pedersen_plookup::hash_single
pedersen_plookup::hash_single now validates the low and high scalar slice values match the original scalar
bigfield::operator- now correctly uses the UltraPlonk code path if able to
added biggroup::multiple_montgomery_ladder to reduce required field multiplications
added biggroup::quadruple_and_add to reduce required field multiplications
biggroup_nafs now directly calls the Composer range constraint methods to avoid creating redundant arithmetic gates when using the PlookupComposer
biggroup plookup ROM tables now track the maximum size of any field element recovered from the table (i.e. the maximum of the input maximum sizes)
biggroup batch tables prefer to create size-6 lookup tables if doing so reduces the number of individual tables required for a given MSM
recursion::transcript no longer performs redundant range constraints when adding buffer elements
recursion::transcript correctly checks that, when slicing field elements, the slice values are correct over the integers (i.e. slice_sum != original + p)
recursion::verification_key now optimally packs key data into minimum required number of field elements before hashing
recursion::verifier proof and key data is now correctly extracted from the transcript/key instead of being generated directly as witnesses.
cleaned up code + comments
code tidy, added more comments
cleaned up how aggregation object handles public inputs
native verification_key::compress matches circuit output
fixed compile errors + failing tests
compiler error
join_split.test.cpp passing

Note: not changing any upstream .js verification keys. I don't think we need to as bberg is now decoupled from aztec connect

* compiler fix

* more compiler fix

* attempt to fix .js and .sol tests

* revert keccak transcript to original functionality

* added hash_index back into verification_key::compress

fixed composer bug where `decompose_into_default_range` was sometimes not range-constraining last limb
removed commented-out code
added more descriptive comments to PedersenPreimageBuilder

* changed join-split vkey

* temporarily point to branch of aztec that updates aggregation state usage until fix is in aztec master

* revert .aztec-packages-commit

* header brittleness fix

* compiler fix

* compiler fix w.
aggregation object * reverting changes to `assign_object_to_proof_outputs` to preserve backwards-compatibility with a3-packages * more backwards compatibility fixes * wip --------- Co-authored-by: dbanks12 Co-authored-by: David Banks <47112877+dbanks12@users.noreply.github.com> --- .../convert_buffer_to_field.hpp | 10 + .../crypto/pedersen_commitment/pedersen.cpp | 8 +- .../crypto/pedersen_commitment/pedersen.hpp | 2 +- .../pedersen_commitment/pedersen_lookup.cpp | 19 +- .../pedersen_commitment/pedersen_lookup.hpp | 4 +- .../pedersen_lookup.test.cpp | 20 +- .../honk/composer/ultra_honk_composer.hpp | 5 +- .../composer/ultra_honk_composer.test.cpp | 3 +- .../barretenberg/honk/proof_system/prover.hpp | 3 +- .../honk/proof_system/ultra_prover.hpp | 3 +- .../proofs/join_split/join_split.test.cpp | 5 +- .../splitting_tmp/ultra_plonk_composer.hpp | 7 +- .../ultra_plonk_composer.test.cpp | 2 +- .../plonk/composer/ultra_composer.cpp | 259 ++++++----- .../plonk/composer/ultra_composer.hpp | 61 +-- .../plonk/composer/ultra_composer.test.cpp | 2 +- .../verification_key/verification_key.cpp | 57 +-- .../ultra_circuit_constructor.cpp | 219 +++++----- .../ultra_circuit_constructor.hpp | 119 +++-- .../ultra_circuit_constructor.test.cpp | 2 +- .../circuits/recursive_circuit.hpp | 2 +- .../commitment/pedersen/pedersen_plookup.cpp | 82 +++- .../commitment/pedersen/pedersen_plookup.hpp | 11 +- .../stdlib/hash/pedersen/pedersen_plookup.cpp | 43 +- .../stdlib/hash/pedersen/pedersen_plookup.hpp | 2 +- .../primitives/bigfield/bigfield_impl.hpp | 22 +- .../stdlib/primitives/biggroup/biggroup.hpp | 241 ++++------ .../primitives/biggroup/biggroup.test.cpp | 30 +- .../biggroup/biggroup_batch_mul.hpp | 11 +- .../primitives/biggroup/biggroup_bn254.hpp | 80 ++-- .../primitives/biggroup/biggroup_impl.hpp | 412 ++++++++++-------- .../primitives/biggroup/biggroup_nafs.hpp | 43 +- .../biggroup/biggroup_secp256k1.hpp | 13 +- .../primitives/biggroup/biggroup_tables.hpp | 272 +++++++----- .../aggregation_state/aggregation_state.hpp | 38 +- .../recursion/transcript/transcript.hpp | 253 +++++------ .../verification_key/verification_key.hpp | 301 +++++++++---- .../stdlib/recursion/verifier/verifier.hpp | 58 +-- .../recursion/verifier/verifier.test.cpp | 57 ++- .../verifier/verifier_turbo.test.cpp | 2 +- .../barretenberg/transcript/transcript.cpp | 18 +- .../keys/RecursiveUltraVerificationKey.sol | 104 ++--- 42 files changed, 1611 insertions(+), 1294 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp index 10adacf4ce66..9b657280adc4 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp @@ -5,6 +5,16 @@ namespace crypto { namespace pedersen_commitment { +/** + * @brief Converts input uint8_t buffers into vector of field elements. Used to hash the Transcript in a SNARK-friendly + * manner for recursive circuits. + * + * `buffer` is an unstructured byte array we want to convert these into field elements + * prior to hashing. We do this by splitting buffer into 31-byte chunks. 
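 *
 * Editor's sketch (not part of the original patch) of the chunking described above; big-endian
 * packing and the treatment of a final partial chunk are assumptions, and `grumpkin::fq` /
 * `uint256_t` are the repo's own types:
 * @code
 *   std::vector<grumpkin::fq> elements;
 *   for (size_t offset = 0; offset < input.size(); offset += 31) {
 *       const size_t bytes_in_chunk = std::min(static_cast<size_t>(31), input.size() - offset);
 *       uint256_t chunk = 0;
 *       for (size_t i = 0; i < bytes_in_chunk; ++i) {
 *           chunk = (chunk << 8) + input[offset + i]; // a 31-byte value always fits below the ~254-bit modulus
 *       }
 *       elements.emplace_back(grumpkin::fq(chunk));
 *   }
 * @endcode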
+ * + * @param buffer + * @return std::vector + */ inline std::vector convert_buffer_to_field(const std::vector& input) { const size_t num_bytes = input.size(); diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp index 639dfa2440d5..8e3aa99ef15d 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp @@ -105,16 +105,16 @@ grumpkin::fq compress_native(const std::vector& input) +grumpkin::fq compress_native_buffer_to_field(const std::vector& input, const size_t hash_index) { const auto elements = convert_buffer_to_field(input); - grumpkin::fq result_fq = compress_native(elements); + grumpkin::fq result_fq = compress_native(elements, hash_index); return result_fq; } -grumpkin::fq compress_native(const std::vector& input) +grumpkin::fq compress_native(const std::vector& input, const size_t hash_index) { - return compress_native_buffer_to_field(input); + return compress_native_buffer_to_field(input, hash_index); } } // namespace pedersen_commitment diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp index d7275aa6ac70..0600e13b529c 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp @@ -22,7 +22,7 @@ template grumpkin::fq compress_native(const std::array& input); +grumpkin::fq compress_native(const std::vector& input, const size_t hash_index = 0); grumpkin::fq compress_native(const std::vector>& input_pairs); diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp index e2333acfa6e5..5e6288e8dfa7 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp @@ -28,11 +28,12 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector& i const size_t num_inputs = inputs.size(); grumpkin::fq result = (pedersen_iv_table[iv]).x; - for (size_t i = 0; i < num_inputs; i++) { + result = hash_pair(result, num_inputs); + for (size_t i = 0; i < num_inputs - 1; i++) { result = hash_pair(result, inputs[i]); } - return (hash_single(result, false) + hash_single(grumpkin::fq(num_inputs), true)); + return (hash_single(result, false) + hash_single(inputs[num_inputs - 1], true)); } grumpkin::g1::element merkle_damgard_compress(const std::vector& inputs, const std::vector& ivs) @@ -46,7 +47,8 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector& i const size_t num_inputs = inputs.size(); grumpkin::fq result = (pedersen_iv_table[0]).x; - for (size_t i = 0; i < 2 * num_inputs; i++) { + result = hash_pair(result, num_inputs); + for (size_t i = 0; i < 2 * num_inputs - 1; i++) { if ((i & 1) == 0) { grumpkin::fq iv_result = (pedersen_iv_table[ivs[i >> 1]]).x; result = hash_pair(result, iv_result); @@ -54,8 +56,7 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector& i result = hash_pair(result, inputs[i >> 1]); } } - - return (hash_single(result, false) + hash_single(grumpkin::fq(num_inputs), true)); + return (hash_single(result, false) + hash_single(inputs[num_inputs - 1], true)); } grumpkin::g1::element 
merkle_damgard_tree_compress(const std::vector& inputs, @@ -111,16 +112,16 @@ grumpkin::fq compress_native(const std::vector& inputs, const std: return commit_native(inputs, hash_indices).x; } -grumpkin::fq compress_native_buffer_to_field(const std::vector& input) +grumpkin::fq compress_native_buffer_to_field(const std::vector& input, const size_t hash_index) { const auto elements = convert_buffer_to_field(input); - grumpkin::fq result_fq = compress_native(elements); + grumpkin::fq result_fq = compress_native(elements, hash_index); return result_fq; } -std::vector compress_native(const std::vector& input) +std::vector compress_native(const std::vector& input, const size_t hash_index) { - const auto result_fq = compress_native_buffer_to_field(input); + const auto result_fq = compress_native_buffer_to_field(input, hash_index); uint256_t result_u256(result_fq); const size_t num_bytes = input.size(); diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp index 0f99b13fbbdc..b77fac9688da 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp @@ -13,9 +13,9 @@ grumpkin::g1::element merkle_damgard_tree_compress(const std::vector& inputs, const size_t hash_index = 0); grumpkin::fq compress_native(const std::vector& inputs, const std::vector& hash_indices); -std::vector compress_native(const std::vector& input); +std::vector compress_native(const std::vector& input, const size_t hash_index = 0); -grumpkin::fq compress_native_buffer_to_field(const std::vector& input); +grumpkin::fq compress_native_buffer_to_field(const std::vector& input, const size_t hash_index = 0); template grumpkin::fq compress_native(const std::array& inputs) { diff --git a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp index 82d0b4f7ed02..a06f5cea5885 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp +++ b/barretenberg/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp @@ -157,7 +157,9 @@ TEST(pedersen_lookup, merkle_damgard_compress) const auto result = crypto::pedersen_commitment::lookup::merkle_damgard_compress(inputs, iv); - fq intermediate = (grumpkin::g1::affine_one * fr(iv + 1)).x; + auto iv_hash = compute_expected((grumpkin::g1::affine_one * fr(iv + 1)).x, 0); + auto length = compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2)); + fq intermediate = affine_element(iv_hash + length).x; for (size_t i = 0; i < m; i++) { intermediate = affine_element(compute_expected(intermediate, 0) + @@ -165,10 +167,7 @@ TEST(pedersen_lookup, merkle_damgard_compress) .x; } - EXPECT_EQ(affine_element(result).x, - affine_element(compute_expected(intermediate, 0) + - compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2))) - .x); + EXPECT_EQ(affine_element(result).x, intermediate); } TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv) @@ -188,7 +187,11 @@ TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv) const auto result = crypto::pedersen_commitment::lookup::merkle_damgard_compress(inputs, ivs); const size_t initial_iv = 0; - fq intermediate = (grumpkin::g1::affine_one * fr(initial_iv + 1)).x; + auto iv_hash = 
compute_expected((grumpkin::g1::affine_one * fr(initial_iv + 1)).x, 0); + + auto length = compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2)); + fq intermediate = affine_element(iv_hash + length).x; + for (size_t i = 0; i < 2 * m; i++) { if ((i & 1) == 0) { const auto iv = (grumpkin::g1::affine_one * fr(ivs[i >> 1] + 1)).x; @@ -204,10 +207,7 @@ TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv) } } - EXPECT_EQ(affine_element(result).x, - affine_element(compute_expected(intermediate, 0) + - compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2))) - .x); + EXPECT_EQ(affine_element(result).x, intermediate); } TEST(pedersen_lookup, merkle_damgard_tree_compress) diff --git a/barretenberg/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp b/barretenberg/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp index e17087aa8871..1073ec7825ca 100644 --- a/barretenberg/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp @@ -371,11 +371,12 @@ class UltraHonkComposer { }; // std::array decompose_non_native_field_double_width_limb( // const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); - std::array queue_non_native_field_multiplication( + std::array evaluate_non_native_field_multiplication( const UltraCircuitConstructor::non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true) { - return circuit_constructor.queue_non_native_field_multiplication(input, range_constrain_quotient_and_remainder); + return circuit_constructor.evaluate_non_native_field_multiplication(input, + range_constrain_quotient_and_remainder); }; // std::array evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& // input); typedef std::pair scaled_witness; typedef std::tuple concept StandardFlavor = IsAnyOf; +template +concept StandardFlavor = IsAnyOf; template class StandardProver_ { diff --git a/barretenberg/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp b/barretenberg/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp index bcf665711c0e..9bbe7314f2d1 100644 --- a/barretenberg/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp @@ -13,7 +13,8 @@ namespace proof_system::honk { // We won't compile this class with honk::flavor::Standard, but we will like want to compile it (at least for testing) // with a flavor that uses the curve Grumpkin, or a flavor that does/does not have zk, etc. 
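Looking back at the merkle_damgard_compress changes in pedersen_lookup.cpp above: the input count is now absorbed at the front of the hash chain, and the final input takes the slot the length used to occupy. An editor's sketch of the new native ordering, closely mirroring the diff (hash_pair, hash_single and pedersen_iv_table are the file's existing helpers; assumes a non-empty input vector):

grumpkin::g1::element merkle_damgard_compress_sketch(const std::vector<grumpkin::fq>& inputs, const size_t iv)
{
    const size_t num_inputs = inputs.size();
    grumpkin::fq result = (pedersen_iv_table[iv]).x;
    // hashing the length first means inputs of different lengths cannot collide
    result = hash_pair(result, grumpkin::fq(num_inputs));
    for (size_t i = 0; i < num_inputs - 1; i++) {
        result = hash_pair(result, inputs[i]);
    }
    // the last input is folded in by the terminal pair of hash_single calls, as in the diff
    return hash_single(result, false) + hash_single(inputs[num_inputs - 1], true);
}

The in-circuit version in stdlib (later in this patch) follows the same order; as its comment notes, when the iv is a constant the first few hashes are over constants and add no constraints.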
-template concept UltraFlavor = IsAnyOf; +template +concept UltraFlavor = IsAnyOf; template class UltraProver_ { using FF = typename Flavor::FF; diff --git a/barretenberg/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp b/barretenberg/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp index aa7e516609e4..2b81096c4206 100644 --- a/barretenberg/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp +++ b/barretenberg/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp @@ -806,11 +806,12 @@ TEST_F(join_split_tests, test_0_input_notes_and_detect_circuit_change) // The below part detects any changes in the join-split circuit - constexpr uint32_t CIRCUIT_GATE_COUNT = 185573; + constexpr uint32_t CIRCUIT_GATE_COUNT = 183834; constexpr uint32_t GATES_NEXT_POWER_OF_TWO = 524288; - const uint256_t VK_HASH("13eb88883e80efb9bf306af2962cd1a49e9fa1b0bfb2d4b563b95217a17bcc74"); + const uint256_t VK_HASH("5c2e0fe914dbbf23d6bac6ae4db9a7e43d98c0b9d71c9200208dbce24a815c6e"); auto number_of_gates_js = result.number_of_gates; + std::cout << get_verification_key()->sha256_hash() << std::endl; auto vk_hash_js = get_verification_key()->sha256_hash(); if (!CIRCUIT_CHANGE_EXPECTED) { diff --git a/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp b/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp index 53c8ff8f9036..703e037e641f 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp +++ b/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp @@ -380,13 +380,14 @@ class UltraPlonkComposer { }; // std::array decompose_non_native_field_double_width_limb( // const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); - std::array queue_non_native_field_multiplication( + std::array evaluate_non_native_field_multiplication( const UltraCircuitConstructor::non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true) { - return circuit_constructor.queue_non_native_field_multiplication(input, range_constrain_quotient_and_remainder); + return circuit_constructor.evaluate_non_native_field_multiplication(input, + range_constrain_quotient_and_remainder); }; - // std::array evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& + // std::array queue_partial_non_native_field_multiplication(const non_native_field_witnesses& // input); typedef std::pair scaled_witness; typedef std::tuple add_simple; std::array evaluate_non_native_field_subtraction( // add_simple limb0, diff --git a/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp b/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp index a5f6e09822d4..fe95dfd298a7 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp +++ b/barretenberg/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp @@ -781,7 +781,7 @@ TEST(ultra_plonk_composer_splitting_tmp, non_native_field_multiplication) UltraCircuitConstructor::non_native_field_witnesses inputs{ a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)), }; - const auto [lo_1_idx, hi_1_idx] = composer.queue_non_native_field_multiplication(inputs); + const auto [lo_1_idx, hi_1_idx] = 
composer.evaluate_non_native_field_multiplication(inputs); composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70); auto prover = composer.create_prover(); diff --git a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp index 1f3e9fb3ed33..ce0f4eac7b82 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp +++ b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp @@ -1138,7 +1138,30 @@ std::vector UltraComposer::decompose_into_default_range(const uint32_t const auto limb_idx = add_variable(sublimbs[i]); sublimb_indices.emplace_back(limb_idx); if ((i == sublimbs.size() - 1) && has_remainder_bits) { - create_new_range_constraint(limb_idx, last_limb_range); + if ((target_range_bitnum - last_limb_size) < DEFAULT_PLOOKUP_RANGE_CUTOFF_BITNUM) { + // we don't want to make a new range table. + // X = limb, L = last limb range, K = sublimb mask. L < X + // we want X <= L + // i.e. L - X >= 0 and L - X <= K + // equivalent to saying L - X <= K + // D = L - X + // D + X - L + barretenberg::fr diff = uint256_t(last_limb_range) - get_variable(limb_idx); + uint32_t diff_idx = add_variable(diff); + create_add_gate({ + .a = limb_idx, + .b = zero_idx, + .c = diff_idx, + .a_scaling = 1, + .b_scaling = 0, + .c_scaling = 1, + .const_scaling = -barretenberg::fr(last_limb_range), + }); + create_new_range_constraint(diff_idx, sublimb_mask); + create_new_range_constraint(limb_idx, sublimb_mask); + } else { + create_new_range_constraint(limb_idx, last_limb_range); + } } else { create_new_range_constraint(limb_idx, sublimb_mask); } @@ -1860,22 +1883,18 @@ std::array UltraComposer::decompose_non_native_field_double_width_l } /** - * @brief Queue up non-native field multiplication data. + * @brief Process a non-native field multiplication data. * - * @details The data queued represents a non-native field multiplication identity a * b = q * p + r, + * @details The data represents a non-native field multiplication identity a * b = q * p + r, * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables. * - * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would - * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove - * duplicates in the circuit finishing stage of the proving key computation. - * * The non-native field modulus, p, is a circuit constant * * The return value are the witness indices of the two remainder limbs `lo_1, hi_2` * * N.B.: This method does NOT evaluate the prime field component of non-native field multiplications. 
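 *
 * Editor's note (not part of the original patch): with b = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS, each
 * emulated element is stored as four b-bit limbs, x = x[0] + x[1]*2^b + x[2]*2^(2b) + x[3]*2^(3b).
 * The gates below arithmetise a*b + q*(-p) - r = 0 modulo 2^(4b): the limb cross-products are
 * accumulated into a low half (the 2^0 and 2^b columns) and a high half (the 2^(2b) and 2^(3b)
 * columns), and the carry terms linking and leaving those halves (lo_1, hi_3) are the witnesses
 * returned to the caller so they can be range constrained (see range_constrain_two_limbs in the
 * tests later in this patch). The check modulo the native circuit modulus is the "prime field
 * component" referred to in the N.B. above and is not performed here.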
**/ -std::array UltraComposer::queue_non_native_field_multiplication( +std::array UltraComposer::evaluate_non_native_field_multiplication( const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder) { @@ -1903,10 +1922,11 @@ std::array UltraComposer::queue_non_native_field_multiplication( get_variable(input.r[2]), get_variable(input.r[3]), }; - constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS; constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << (3 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); + constexpr barretenberg::fr LIMB_RSHIFT = + barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); constexpr barretenberg::fr LIMB_RSHIFT_2 = barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); @@ -1955,68 +1975,114 @@ std::array UltraComposer::queue_non_native_field_multiplication( range_constrain_two_limbs(input.q[2], input.q[3]); } - // Add witnesses into the multiplication cache - // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods) - cached_non_native_field_multiplication cache_entry{ - .a = input.a, - .b = input.b, - .q = input.q, - .r = input.r, - .cross_terms = { lo_0_idx, lo_1_idx, hi_0_idx, hi_1_idx, hi_2_idx, hi_3_idx }, - .neg_modulus = input.neg_modulus, - }; - cached_non_native_field_multiplications.emplace_back(cache_entry); + // product gate 1 + // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0 + create_big_add_gate({ input.q[0], + input.q[1], + input.r[1], + lo_1_idx, + input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT, + input.neg_modulus[0] * LIMB_SHIFT, + -LIMB_SHIFT, + -LIMB_SHIFT.sqr(), + 0 }, + true); + + w_l.emplace_back(input.a[1]); + w_r.emplace_back(input.b[1]); + w_o.emplace_back(input.r[0]); + w_4.emplace_back(lo_0_idx); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); + ++num_gates; + w_l.emplace_back(input.a[0]); + w_r.emplace_back(input.b[0]); + w_o.emplace_back(input.a[3]); + w_4.emplace_back(input.b[3]); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2); + ++num_gates; + w_l.emplace_back(input.a[2]); + w_r.emplace_back(input.b[2]); + w_o.emplace_back(input.r[3]); + w_4.emplace_back(hi_0_idx); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); + ++num_gates; + w_l.emplace_back(input.a[1]); + w_r.emplace_back(input.b[1]); + w_o.emplace_back(input.r[2]); + w_4.emplace_back(hi_1_idx); + apply_aux_selectors(AUX_SELECTORS::NONE); + ++num_gates; + + /** + * product gate 6 + * + * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0 + * + **/ + create_big_add_gate( + { + input.q[2], + input.q[3], + lo_1_idx, + hi_1_idx, + -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0], + -input.neg_modulus[0] * LIMB_SHIFT, + -1, + -1, + 0, + }, + true); + + /** + * product gate 7 + * + * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b + **/ + create_big_add_gate({ + hi_3_idx, + input.q[0], + input.q[1], + hi_2_idx, + -1, + input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2, + input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2, + LIMB_RSHIFT_2, + 0, + }); return std::array{ lo_1_idx, hi_3_idx }; } /** * @brief Called in `compute_proving_key` when finalizing circuit. 
- * Iterates over the cached_non_native_field_multiplication objects, + * Iterates over the cached_partial_non_native_field_multiplication objects, * removes duplicates, and instantiates the remainder as constraints` */ void UltraComposer::process_non_native_field_multiplications() { - std::sort(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end()); - - auto last = - std::unique(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end()); + for (size_t i = 0; i < cached_partial_non_native_field_multiplications.size(); ++i) { + auto& c = cached_partial_non_native_field_multiplications[i]; + for (size_t j = 0; j < 5; ++j) { + c.a[j] = real_variable_index[c.a[j]]; + c.b[j] = real_variable_index[c.b[j]]; + } + } + std::sort(cached_partial_non_native_field_multiplications.begin(), + cached_partial_non_native_field_multiplications.end()); - auto it = cached_non_native_field_multiplications.begin(); + auto last = std::unique(cached_partial_non_native_field_multiplications.begin(), + cached_partial_non_native_field_multiplications.end()); - constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS; - constexpr barretenberg::fr LIMB_RSHIFT = - barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); - constexpr barretenberg::fr LIMB_RSHIFT_2 = - barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); + auto it = cached_partial_non_native_field_multiplications.begin(); // iterate over the cached items and create constraints while (it != last) { const auto input = *it; - const uint32_t lo_0_idx = input.cross_terms.lo_0_idx; - const uint32_t lo_1_idx = input.cross_terms.lo_1_idx; - const uint32_t hi_0_idx = input.cross_terms.hi_0_idx; - const uint32_t hi_1_idx = input.cross_terms.hi_1_idx; - const uint32_t hi_2_idx = input.cross_terms.hi_2_idx; - const uint32_t hi_3_idx = input.cross_terms.hi_3_idx; - - // product gate 1 - // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0 - create_big_add_gate({ input.q[0], - input.q[1], - input.r[1], - lo_1_idx, - input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT, - input.neg_modulus[0] * LIMB_SHIFT, - -LIMB_SHIFT, - -LIMB_SHIFT.sqr(), - 0 }, - true); w_l.emplace_back(input.a[1]); w_r.emplace_back(input.b[1]); - w_o.emplace_back(input.r[0]); - w_4.emplace_back(lo_0_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.lo_0); apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); ++num_gates; w_l.emplace_back(input.a[0]); @@ -2027,65 +2093,34 @@ void UltraComposer::process_non_native_field_multiplications() ++num_gates; w_l.emplace_back(input.a[2]); w_r.emplace_back(input.b[2]); - w_o.emplace_back(input.r[3]); - w_4.emplace_back(hi_0_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.hi_0); apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); ++num_gates; w_l.emplace_back(input.a[1]); w_r.emplace_back(input.b[1]); - w_o.emplace_back(input.r[2]); - w_4.emplace_back(hi_1_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.hi_1); apply_aux_selectors(AUX_SELECTORS::NONE); ++num_gates; - - /** - * product gate 6 - * - * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0 - * - **/ - create_big_add_gate( - { - input.q[2], - input.q[3], - lo_1_idx, - hi_1_idx, - -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0], - -input.neg_modulus[0] * LIMB_SHIFT, - -1, - -1, - 0, - }, - true); - - /** - * product gate 7 - * - 
* hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b - **/ - create_big_add_gate({ - hi_3_idx, - input.q[0], - input.q[1], - hi_2_idx, - -1, - input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2, - input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2, - LIMB_RSHIFT_2, - 0, - }); ++it; } } /** - * Compute the limb-multiplication part of a non native field mul + * @brief Queue the limb-multiplication part of a non native field mul * * i.e. compute the low 204 and high 204 bit components of `a * b` where `a, b` are nnf elements composed of 4 * limbs with size DEFAULT_NON_NATIVE_FIELD_LIMB_BITS * + * @details The data queued represents part of a non-native field multiplication identity a * b = q * p + r, + * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables. + * + * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would + * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove + * duplicates in the circuit finishing stage of the proving key computation. **/ -std::array UltraComposer::evaluate_partial_non_native_field_multiplication( +std::array UltraComposer::queue_partial_non_native_field_multiplication( const non_native_field_witnesses& input) { @@ -2113,30 +2148,16 @@ std::array UltraComposer::evaluate_partial_non_native_field_multipl const uint32_t hi_0_idx = add_variable(hi_0); const uint32_t hi_1_idx = add_variable(hi_1); - w_l.emplace_back(input.a[1]); - w_r.emplace_back(input.b[1]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(lo_0_idx); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); - ++num_gates; - w_l.emplace_back(input.a[0]); - w_r.emplace_back(input.b[0]); - w_o.emplace_back(input.a[3]); - w_4.emplace_back(input.b[3]); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2); - ++num_gates; - w_l.emplace_back(input.a[2]); - w_r.emplace_back(input.b[2]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(hi_0_idx); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); - ++num_gates; - w_l.emplace_back(input.a[1]); - w_r.emplace_back(input.b[1]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(hi_1_idx); - apply_aux_selectors(AUX_SELECTORS::NONE); - ++num_gates; + // Add witnesses into the multiplication cache + // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods) + cached_partial_non_native_field_multiplication cache_entry{ + .a = input.a, + .b = input.b, + .lo_0 = lo_0_idx, + .hi_0 = hi_0_idx, + .hi_1 = hi_1_idx, + }; + cached_partial_non_native_field_multiplications.emplace_back(cache_entry); return std::array{ lo_0_idx, hi_1_idx }; } diff --git a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp index 6f22b40bb00a..eb0e5b0e92bb 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp +++ b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp @@ -29,6 +29,11 @@ class UltraComposer : public ComposerBase { // large ranges such as 2^64. 
For such ranges the element will be decomposed into smaller // chuncks according to the parameter below static constexpr size_t DEFAULT_PLOOKUP_RANGE_BITNUM = 14; + // (DEFAULT_PLOOKUP_RANGE_BITNUM - DEFAULT_PLOOKUP_RANGE_CUTOFF_SIZE) = maximum size of range table that + // `decompose_into_default_range` will create in addition to the DEFAULT_PLOOKUP_RANGE_BITNUM table e.g. we don't + // want to create a range table of size (DEFAULT_PLOOKUP_RANGE_BITNUM - 1) if it contains very few entries; each + // table has a O(1 << bitnum) constraint cost to create + static constexpr size_t DEFAULT_PLOOKUP_RANGE_CUTOFF_BITNUM = 4; static constexpr size_t DEFAULT_PLOOKUP_RANGE_STEP_SIZE = 3; static constexpr size_t DEFAULT_PLOOKUP_RANGE_SIZE = (1 << DEFAULT_PLOOKUP_RANGE_BITNUM) - 1; static constexpr size_t DEFAULT_NON_NATIVE_FIELD_LIMB_BITS = 68; @@ -36,7 +41,7 @@ class UltraComposer : public ComposerBase { static constexpr size_t NUMBER_OF_GATES_PER_RAM_ACCESS = 2; static constexpr size_t NUMBER_OF_ARITHMETIC_GATES_PER_RAM_ARRAY = 1; // number of gates created per non-native field operation in process_non_native_field_multiplications - static constexpr size_t GATES_PER_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC = 7; + static constexpr size_t GATES_PER_PARTIAL_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC = 4; struct non_native_field_witnesses { // first 4 array elements = limbs // 5th element = prime basis limb @@ -58,30 +63,27 @@ class UltraComposer : public ComposerBase { }; /** - * @brief Used to store instructions to create non_native_field_multiplication gates. + * @brief Used to store instructions to create partial_non_native_field_multiplication gates. * We want to cache these (and remove duplicates) as the stdlib code can end up multiplying the same inputs * repeatedly. 
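 *
 * Editor's note (not part of the original patch), summarising the flow visible elsewhere in this
 * diff: queue_partial_non_native_field_multiplication only adds the lo_0/hi_0/hi_1 witnesses and
 * records { a, b, lo_0, hi_0, hi_1 } in this cache; no gates are created at that point. When the
 * circuit is finalised, process_non_native_field_multiplications maps each a[i], b[i] through
 * real_variable_index (so copies of the same underlying wire compare equal), sorts the cache,
 * strips duplicates with std::unique, and only then lays down the four aux-selector gates per
 * surviving entry (GATES_PER_PARTIAL_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC = 4).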
*/ - struct cached_non_native_field_multiplication { + struct cached_partial_non_native_field_multiplication { std::array a; std::array b; - std::array q; - std::array r; - non_native_field_multiplication_cross_terms cross_terms; - std::array neg_modulus; + barretenberg::fr lo_0; + barretenberg::fr hi_0; + barretenberg::fr hi_1; - bool operator==(const cached_non_native_field_multiplication& other) const + bool operator==(const cached_partial_non_native_field_multiplication& other) const { bool valid = true; for (size_t i = 0; i < 5; ++i) { valid = valid && (a[i] == other.a[i]); valid = valid && (b[i] == other.b[i]); - valid = valid && (q[i] == other.q[i]); - valid = valid && (r[i] == other.r[i]); } return valid; } - bool operator<(const cached_non_native_field_multiplication& other) const + bool operator<(const cached_partial_non_native_field_multiplication& other) const { if (a < other.a) { return true; @@ -90,22 +92,13 @@ class UltraComposer : public ComposerBase { if (b < other.b) { return true; } - if (b == other.b) { - if (q < other.q) { - return true; - } - if (q == other.q) { - if (r < other.r) { - return true; - } - } - } } return false; } }; - std::vector cached_non_native_field_multiplications; + std::vector cached_partial_non_native_field_multiplications; + void process_non_native_field_multiplications(); enum AUX_SELECTORS { @@ -392,13 +385,23 @@ class UltraComposer : public ComposerBase { rangecount += ram_range_sizes[i]; } } - std::vector nnf_copy(cached_non_native_field_multiplications); + + std::vector pnnf_copy( + cached_partial_non_native_field_multiplications); + for (size_t i = 0; i < pnnf_copy.size(); ++i) { + auto& c = pnnf_copy[i]; + for (size_t j = 0; j < 5; ++j) { + c.a[j] = real_variable_index[c.a[j]]; + c.b[j] = real_variable_index[c.b[j]]; + } + } // update nnfcount - std::sort(nnf_copy.begin(), nnf_copy.end()); + std::sort(pnnf_copy.begin(), pnnf_copy.end()); + auto plast = std::unique(pnnf_copy.begin(), pnnf_copy.end()); - auto last = std::unique(nnf_copy.begin(), nnf_copy.end()); - const size_t num_nnf_ops = static_cast(std::distance(nnf_copy.begin(), last)); - nnfcount = num_nnf_ops * GATES_PER_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC; + nnfcount = static_cast(std::distance(pnnf_copy.begin(), plast)) * + GATES_PER_PARTIAL_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC; + ; } /** @@ -545,9 +548,9 @@ class UltraComposer : public ComposerBase { const size_t hi_limb_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); std::array decompose_non_native_field_double_width_limb( const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); - std::array queue_non_native_field_multiplication( + std::array evaluate_non_native_field_multiplication( const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true); - std::array evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& input); + std::array queue_partial_non_native_field_multiplication(const non_native_field_witnesses& input); typedef std::pair scaled_witness; typedef std::tuple add_simple; std::array evaluate_non_native_field_subtraction( diff --git a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp index 0ec324aa474e..0ac6de40dc5f 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp +++ b/barretenberg/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp @@ -649,7 +649,7 @@ 
TYPED_TEST(ultra_composer, non_native_field_multiplication) UltraComposer::non_native_field_witnesses inputs{ a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)), }; - const auto [lo_1_idx, hi_1_idx] = composer.queue_non_native_field_multiplication(inputs); + const auto [lo_1_idx, hi_1_idx] = composer.evaluate_non_native_field_multiplication(inputs); composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70); TestFixture::prove_and_verify(composer, /*expected_result=*/true); diff --git a/barretenberg/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp b/barretenberg/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp index f6cf92bd7327..e66d3b56b8c6 100644 --- a/barretenberg/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp +++ b/barretenberg/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp @@ -51,43 +51,34 @@ barretenberg::fr compress_native_evaluation_domain(barretenberg::evaluation_doma */ barretenberg::fr verification_key_data::compress_native(const size_t hash_index) { - barretenberg::evaluation_domain domain = evaluation_domain(circuit_size); - barretenberg::fr compressed_domain = - compress_native_evaluation_domain(domain, proof_system::ComposerType(composer_type)); - - constexpr size_t num_limb_bits = plonk::NUM_LIMB_BITS_IN_FIELD_SIMULATION; - - const auto split_bigfield_limbs = [](const uint256_t& element) { - std::vector limbs; - limbs.push_back(element.slice(0, num_limb_bits)); - limbs.push_back(element.slice(num_limb_bits, num_limb_bits * 2)); - limbs.push_back(element.slice(num_limb_bits * 2, num_limb_bits * 3)); - limbs.push_back(element.slice(num_limb_bits * 3, num_limb_bits * 4)); - return limbs; - }; - - std::vector preimage_data; - preimage_data.emplace_back(composer_type); - preimage_data.emplace_back(compressed_domain); - preimage_data.emplace_back(num_public_inputs); + barretenberg::evaluation_domain eval_domain = evaluation_domain(circuit_size); + + std::vector preimage_data; + + preimage_data.push_back(static_cast(proof_system::ComposerType(composer_type))); + + const uint256_t domain = eval_domain.domain; + const uint256_t generator = eval_domain.generator; + const uint256_t public_inputs = num_public_inputs; + + ASSERT(domain < (uint256_t(1) << 32)); + ASSERT(generator < (uint256_t(1) << 16)); + ASSERT(public_inputs < (uint256_t(1) << 32)); + + write(preimage_data, static_cast(uint256_t(generator))); + write(preimage_data, static_cast(uint256_t(domain))); + write(preimage_data, static_cast(public_inputs)); for (const auto& [tag, selector] : commitments) { - const auto x_limbs = split_bigfield_limbs(selector.x); - const auto y_limbs = split_bigfield_limbs(selector.y); - - preimage_data.push_back(x_limbs[0]); - preimage_data.push_back(x_limbs[1]); - preimage_data.push_back(x_limbs[2]); - preimage_data.push_back(x_limbs[3]); - - preimage_data.push_back(y_limbs[0]); - preimage_data.push_back(y_limbs[1]); - preimage_data.push_back(y_limbs[2]); - preimage_data.push_back(y_limbs[3]); + write(preimage_data, selector.y); + write(preimage_data, selector.x); } + write(preimage_data, eval_domain.root); + barretenberg::fr compressed_key; - if (proof_system::ComposerType(composer_type) == proof_system::ComposerType::PLOOKUP) { - compressed_key = crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index); + if (proof_system::ComposerType(composer_type) == ComposerType::PLOOKUP) { + compressed_key = from_buffer( + 
crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index)); } else { compressed_key = crypto::pedersen_commitment::compress_native(preimage_data, hash_index); } diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp index 569ad0ca7f21..52812f354c9f 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp @@ -1355,7 +1355,7 @@ std::array UltraCircuitConstructor::decompose_non_native_field_doub * @details The data queued represents a non-native field multiplication identity a * b = q * p + r, * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables. * - * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would + * Without this queue some functions, such as proof_system::plonk::stdlib::element::multiple_montgomery_ladder, would * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove * duplicates in the circuit finishing stage of the proving key computation. * @@ -1365,7 +1365,7 @@ std::array UltraCircuitConstructor::decompose_non_native_field_doub * * N.B.: This method does NOT evaluate the prime field component of non-native field multiplications. **/ -std::array UltraCircuitConstructor::queue_non_native_field_multiplication( +std::array UltraCircuitConstructor::evaluate_non_native_field_multiplication( const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder) { @@ -1393,10 +1393,11 @@ std::array UltraCircuitConstructor::queue_non_native_field_multipli get_variable(input.r[2]), get_variable(input.r[3]), }; - constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS; constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << (3 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); + constexpr barretenberg::fr LIMB_RSHIFT = + barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); constexpr barretenberg::fr LIMB_RSHIFT_2 = barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); @@ -1444,17 +1445,81 @@ std::array UltraCircuitConstructor::queue_non_native_field_multipli range_constrain_two_limbs(input.q[0], input.q[1]); range_constrain_two_limbs(input.q[2], input.q[3]); } - // Add witnesses into the multiplication cache - // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods) - cached_non_native_field_multiplication cache_entry{ - .a = input.a, - .b = input.b, - .q = input.q, - .r = input.r, - .cross_terms = { lo_0_idx, lo_1_idx, hi_0_idx, hi_1_idx, hi_2_idx, hi_3_idx }, - .neg_modulus = input.neg_modulus, - }; - cached_non_native_field_multiplications.emplace_back(cache_entry); + + // product gate 1 + // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0 + create_big_add_gate({ input.q[0], + input.q[1], + input.r[1], + lo_1_idx, + input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT, + input.neg_modulus[0] * LIMB_SHIFT, + -LIMB_SHIFT, + -LIMB_SHIFT.sqr(), + 0 }, + true); + + 
w_l.emplace_back(input.a[1]); + w_r.emplace_back(input.b[1]); + w_o.emplace_back(input.r[0]); + w_4.emplace_back(lo_0_idx); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); + ++num_gates; + w_l.emplace_back(input.a[0]); + w_r.emplace_back(input.b[0]); + w_o.emplace_back(input.a[3]); + w_4.emplace_back(input.b[3]); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2); + ++num_gates; + w_l.emplace_back(input.a[2]); + w_r.emplace_back(input.b[2]); + w_o.emplace_back(input.r[3]); + w_4.emplace_back(hi_0_idx); + apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); + ++num_gates; + w_l.emplace_back(input.a[1]); + w_r.emplace_back(input.b[1]); + w_o.emplace_back(input.r[2]); + w_4.emplace_back(hi_1_idx); + apply_aux_selectors(AUX_SELECTORS::NONE); + ++num_gates; + + /** + * product gate 6 + * + * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0 + * + **/ + create_big_add_gate( + { + input.q[2], + input.q[3], + lo_1_idx, + hi_1_idx, + -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0], + -input.neg_modulus[0] * LIMB_SHIFT, + -1, + -1, + 0, + }, + true); + + /** + * product gate 7 + * + * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b + **/ + create_big_add_gate({ + hi_3_idx, + input.q[0], + input.q[1], + hi_2_idx, + -1, + input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2, + input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2, + LIMB_RSHIFT_2, + 0, + }); return std::array{ lo_1_idx, hi_3_idx }; } @@ -1466,46 +1531,29 @@ std::array UltraCircuitConstructor::queue_non_native_field_multipli */ void UltraCircuitConstructor::process_non_native_field_multiplications() { - std::sort(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end()); - - auto last = - std::unique(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end()); + for (size_t i = 0; i < cached_partial_non_native_field_multiplications.size(); ++i) { + auto& c = cached_partial_non_native_field_multiplications[i]; + for (size_t j = 0; j < 5; ++j) { + c.a[j] = real_variable_index[c.a[j]]; + c.b[j] = real_variable_index[c.b[j]]; + } + } + std::sort(cached_partial_non_native_field_multiplications.begin(), + cached_partial_non_native_field_multiplications.end()); - auto it = cached_non_native_field_multiplications.begin(); + auto last = std::unique(cached_partial_non_native_field_multiplications.begin(), + cached_partial_non_native_field_multiplications.end()); - constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS; - constexpr barretenberg::fr LIMB_RSHIFT = - barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS); - constexpr barretenberg::fr LIMB_RSHIFT_2 = - barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); + auto it = cached_partial_non_native_field_multiplications.begin(); // iterate over the cached items and create constraints while (it != last) { const auto input = *it; - const uint32_t lo_0_idx = input.cross_terms.lo_0_idx; - const uint32_t lo_1_idx = input.cross_terms.lo_1_idx; - const uint32_t hi_0_idx = input.cross_terms.hi_0_idx; - const uint32_t hi_1_idx = input.cross_terms.hi_1_idx; - const uint32_t hi_2_idx = input.cross_terms.hi_2_idx; - const uint32_t hi_3_idx = input.cross_terms.hi_3_idx; - - // product gate 1 - // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0 - create_big_add_gate({ input.q[0], - input.q[1], 
- input.r[1], - lo_1_idx, - input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT, - input.neg_modulus[0] * LIMB_SHIFT, - -LIMB_SHIFT, - -LIMB_SHIFT.sqr(), - 0 }, - true); w_l.emplace_back(input.a[1]); w_r.emplace_back(input.b[1]); - w_o.emplace_back(input.r[0]); - w_4.emplace_back(lo_0_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.lo_0); apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); ++num_gates; w_l.emplace_back(input.a[0]); @@ -1516,53 +1564,16 @@ void UltraCircuitConstructor::process_non_native_field_multiplications() ++num_gates; w_l.emplace_back(input.a[2]); w_r.emplace_back(input.b[2]); - w_o.emplace_back(input.r[3]); - w_4.emplace_back(hi_0_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.hi_0); apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); ++num_gates; w_l.emplace_back(input.a[1]); w_r.emplace_back(input.b[1]); - w_o.emplace_back(input.r[2]); - w_4.emplace_back(hi_1_idx); + w_o.emplace_back(zero_idx); + w_4.emplace_back(input.hi_1); apply_aux_selectors(AUX_SELECTORS::NONE); ++num_gates; - - /** - * product gate 6 - * - * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0 - * - **/ - create_big_add_gate( - { - input.q[2], - input.q[3], - lo_1_idx, - hi_1_idx, - -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0], - -input.neg_modulus[0] * LIMB_SHIFT, - -1, - -1, - 0, - }, - true); - - /** - * product gate 7 - * - * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b - **/ - create_big_add_gate({ - hi_3_idx, - input.q[0], - input.q[1], - hi_2_idx, - -1, - input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2, - input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2, - LIMB_RSHIFT_2, - 0, - }); ++it; } } @@ -1574,7 +1585,7 @@ void UltraCircuitConstructor::process_non_native_field_multiplications() * limbs with size DEFAULT_NON_NATIVE_FIELD_LIMB_BITS * **/ -std::array UltraCircuitConstructor::evaluate_partial_non_native_field_multiplication( +std::array UltraCircuitConstructor::queue_partial_non_native_field_multiplication( const non_native_field_witnesses& input) { @@ -1602,30 +1613,16 @@ std::array UltraCircuitConstructor::evaluate_partial_non_native_fie const uint32_t hi_0_idx = add_variable(hi_0); const uint32_t hi_1_idx = add_variable(hi_1); - w_l.emplace_back(input.a[1]); - w_r.emplace_back(input.b[1]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(lo_0_idx); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1); - ++num_gates; - w_l.emplace_back(input.a[0]); - w_r.emplace_back(input.b[0]); - w_o.emplace_back(input.a[3]); - w_4.emplace_back(input.b[3]); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2); - ++num_gates; - w_l.emplace_back(input.a[2]); - w_r.emplace_back(input.b[2]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(hi_0_idx); - apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3); - ++num_gates; - w_l.emplace_back(input.a[1]); - w_r.emplace_back(input.b[1]); - w_o.emplace_back(zero_idx); - w_4.emplace_back(hi_1_idx); - apply_aux_selectors(AUX_SELECTORS::NONE); - ++num_gates; + // Add witnesses into the multiplication cache + // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods) + cached_partial_non_native_field_multiplication cache_entry{ + .a = input.a, + .b = input.b, + .lo_0 = lo_0_idx, + .hi_0 = hi_0_idx, + .hi_1 = hi_1_idx, + }; + cached_partial_non_native_field_multiplications.emplace_back(cache_entry); return std::array{ lo_0_idx, hi_1_idx }; } diff --git 
a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp index 84b18b13bcd4..21bc95239328 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp @@ -167,45 +167,29 @@ class UltraCircuitConstructor : public CircuitConstructorBase ultra_selector_names() - { - std::vector result{ "q_m", "q_c", "q_1", "q_2", "q_3", "q_4", - "q_arith", "q_sort", "q_elliptic", "q_aux", "table_type" }; - return result; - } - struct non_native_field_multiplication_cross_terms { - uint32_t lo_0_idx; - uint32_t lo_1_idx; - uint32_t hi_0_idx; - uint32_t hi_1_idx; - uint32_t hi_2_idx; - uint32_t hi_3_idx; - }; /** - * @brief Used to store instructions to create non_native_field_multiplication gates. + * @brief Used to store instructions to create partial_non_native_field_multiplication gates. * We want to cache these (and remove duplicates) as the stdlib code can end up multiplying the same inputs * repeatedly. */ - struct cached_non_native_field_multiplication { + struct cached_partial_non_native_field_multiplication { std::array a; std::array b; - std::array q; - std::array r; - non_native_field_multiplication_cross_terms cross_terms; - std::array neg_modulus; + barretenberg::fr lo_0; + barretenberg::fr hi_0; + barretenberg::fr hi_1; - bool operator==(const cached_non_native_field_multiplication& other) const + bool operator==(const cached_partial_non_native_field_multiplication& other) const { bool valid = true; for (size_t i = 0; i < 5; ++i) { valid = valid && (a[i] == other.a[i]); valid = valid && (b[i] == other.b[i]); - valid = valid && (q[i] == other.q[i]); - valid = valid && (r[i] == other.r[i]); } return valid; } - bool operator<(const cached_non_native_field_multiplication& other) const + + bool operator<(const cached_partial_non_native_field_multiplication& other) const { if (a < other.a) { return true; @@ -214,27 +198,33 @@ class UltraCircuitConstructor : public CircuitConstructorBase ultra_selector_names() + { + std::vector result{ "q_m", "q_c", "q_1", "q_2", "q_3", "q_4", + "q_arith", "q_sort", "q_elliptic", "q_aux", "table_type" }; + return result; + } + struct non_native_field_multiplication_cross_terms { + uint32_t lo_0_idx; + uint32_t lo_1_idx; + uint32_t hi_0_idx; + uint32_t hi_1_idx; + uint32_t hi_2_idx; + uint32_t hi_3_idx; + }; + /** - * @brief CircuitDataBackup is a structure we use to store all the information about the circuit that is needed to - * restore it back to a pre-finalized state - * @details In check_circuit method in UltraCircuitConstructor we want to check that the whole circuit works, but - * ultra circuits need to have ram, rom and range gates added in the end for the check to be complete as well as the - * set permutation check, so we finalize the circuit when we check it. This structure allows us to restore the - * circuit to the state before the finalization. 
+ * @brief CircuitDataBackup is a structure we use to store all the information about the circuit that is needed + * to restore it back to a pre-finalized state + * @details In check_circuit method in UltraCircuitConstructor we want to check that the whole circuit works, + * but ultra circuits need to have ram, rom and range gates added in the end for the check to be complete as + * well as the set permutation check, so we finalize the circuit when we check it. This structure allows us to + * restore the circuit to the state before the finalization. */ struct CircuitDataBackup { std::vector public_inputs; @@ -272,8 +262,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase memory_write_records; std::map range_lists; - std::vector - cached_non_native_field_multiplications; + std::vector + cached_partial_non_native_field_multiplications; size_t num_gates; bool circuit_finalised = false; @@ -326,14 +316,14 @@ class UltraCircuitConstructor : public CircuitConstructorBaserange_lists; stored_state.circuit_finalised = circuit_constructor->circuit_finalised; stored_state.num_gates = circuit_constructor->num_gates; - stored_state.cached_non_native_field_multiplications = - circuit_constructor->cached_non_native_field_multiplications; + stored_state.cached_partial_non_native_field_multiplications = + circuit_constructor->cached_partial_non_native_field_multiplications; return stored_state; } @@ -398,7 +388,8 @@ class UltraCircuitConstructor : public CircuitConstructorBaserange_lists = range_lists; circuit_constructor->circuit_finalised = circuit_finalised; circuit_constructor->num_gates = num_gates; - circuit_constructor->cached_non_native_field_multiplications = cached_non_native_field_multiplications; + circuit_constructor->cached_partial_non_native_field_multiplications = + cached_partial_non_native_field_multiplications; circuit_constructor->w_l.resize(num_gates); circuit_constructor->w_r.resize(num_gates); circuit_constructor->w_o.resize(num_gates); @@ -511,8 +502,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase ram_arrays; @@ -572,7 +564,7 @@ class UltraCircuitConstructor : public CircuitConstructorBase memory_write_records; - std::vector cached_non_native_field_multiplications; + std::vector cached_partial_non_native_field_multiplications; void process_non_native_field_multiplications(); @@ -638,11 +630,11 @@ class UltraCircuitConstructor : public CircuitConstructorBase size imbalance between sorted and non-sorted sets. Checking for this - * and throwing an error would require a refactor of the Composer to catelog all 'orphan' variables not - * assigned to gates. + * this range constraint will increase the size of the 'sorted set' of range-constrained integers + *by 1. The 'non-sorted set' of range-constrained integers is a subset of the wire indices of all + *arithmetic gates. No arithemtic gate => size imbalance between sorted and non-sorted sets. Checking + *for this and throwing an error would require a refactor of the Composer to catelog all 'orphan' + *variables not assigned to gates. 
**/ create_new_range_constraint(variable_index, 1ULL << num_bits, msg); } else { @@ -704,8 +696,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase decompose_non_native_field_double_width_limb( const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS)); - std::array queue_non_native_field_multiplication( + std::array evaluate_non_native_field_multiplication( const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true); - std::array evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& input); + std::array queue_partial_non_native_field_multiplication(const non_native_field_witnesses& input); typedef std::pair scaled_witness; typedef std::tuple add_simple; std::array evaluate_non_native_field_subtraction( diff --git a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp index aea6e78bde7b..1b9706e75690 100644 --- a/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp +++ b/barretenberg/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp @@ -643,7 +643,7 @@ TEST(ultra_circuit_constructor, non_native_field_multiplication) proof_system::UltraCircuitConstructor::non_native_field_witnesses inputs{ a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)), }; - const auto [lo_1_idx, hi_1_idx] = circuit_constructor.queue_non_native_field_multiplication(inputs); + const auto [lo_1_idx, hi_1_idx] = circuit_constructor.evaluate_non_native_field_multiplication(inputs); circuit_constructor.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70); auto saved_state = UltraCircuitConstructor::CircuitDataBackup::store_full_state(circuit_constructor); diff --git a/barretenberg/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp b/barretenberg/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp index a6dcc0b87c30..7274600b8497 100644 --- a/barretenberg/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp +++ b/barretenberg/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp @@ -125,7 +125,7 @@ template class RecursiveCircuit { throw_or_abort("inner proof result != 1"); } - circuit_output.aggregation_state.add_proof_outputs_as_public_inputs(); + circuit_output.aggregation_state.assign_object_to_proof_outputs(); if (outer_composer.failed()) { throw_or_abort("outer composer failed"); diff --git a/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp b/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp index 6dbc70e5d178..c5ba72f8d35b 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp @@ -14,19 +14,34 @@ using namespace plookup; using namespace barretenberg; template -point pedersen_plookup_commitment::compress_to_point(const field_t& left, const field_t& right) +point pedersen_plookup_commitment::compress_to_point(const field_t& left, + const field_t& right, + const bool skip_rhs_range_check) { auto p2 = pedersen_plookup_hash::hash_single(left, false); - auto p1 = pedersen_plookup_hash::hash_single(right, true); + auto p1 = pedersen_plookup_hash::hash_single(right, true, 
skip_rhs_range_check); return pedersen_plookup_hash::add_points(p1, p2); } -template field_t pedersen_plookup_commitment::compress(const field_t& left, const field_t& right) +template +field_t pedersen_plookup_commitment::compress(const field_t& left, + const field_t& right, + const bool skip_rhs_range_check) { - return compress_to_point(left, right).x; + return compress_to_point(left, right, skip_rhs_range_check).x; } +/** + * @brief Compress a vector of field elements into a grumpkin point. + * This serves as the basis for a collision-resistant hash function. + * Note that this does NOT produce a hash that can be modelled as a random oracle. + * + * @tparam C + * @param inputs + * @param iv initialization vector + * @return point + */ template point pedersen_plookup_commitment::merkle_damgard_compress(const std::vector& inputs, const field_t& iv) { @@ -34,13 +49,19 @@ point pedersen_plookup_commitment::merkle_damgard_compress(const std::vect return point{ 0, 0 }; } + // The first two inputs to the Merkle-Damgard construction are the initialization vector and the number of elements + // being hashed. Including the length ensures that hashes of different lengths cannot collide. Starting the hash + // with these 2 inputs is optimal in the case that the IV is constant. i.e. the 1st 3 calls to `hash_single` are + // over constants and cost no constraints. r = H(iv, num_inputs) is constant and the 1st half of H(r, inputs[0]) is + // also constant auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, iv)[ColumnIdx::C2][0]; auto num_inputs = inputs.size(); - for (size_t i = 0; i < num_inputs; i++) { + result = compress(result, field_t(num_inputs)); + for (size_t i = 0; i < num_inputs - 1; i++) { result = compress(result, inputs[i]); } - return compress_to_point(result, field_t(num_inputs)); + return compress_to_point(result, inputs[num_inputs - 1]); } template @@ -53,7 +74,9 @@ point pedersen_plookup_commitment::merkle_damgard_compress(const std::vect } auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, 0)[ColumnIdx::C2][0]; - for (size_t i = 0; i < 2 * num_inputs; i++) { + result = compress(result, field_t(num_inputs)); + + for (size_t i = 0; i < 2 * num_inputs - 1; i++) { if ((i & 1) == 0) { auto iv_result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, ivs[i >> 1])[ColumnIdx::C2][0]; @@ -63,7 +86,25 @@ point pedersen_plookup_commitment::merkle_damgard_compress(const std::vect } } - return compress_to_point(result, field_t(num_inputs)); + return compress_to_point(result, inputs[num_inputs - 1]); +} + +template +point pedersen_plookup_commitment::merkle_damgard_compress_with_relaxed_range_constraints( + const std::vector& inputs, const field_t& iv) +{ + if (inputs.size() == 0) { + return point{ 0, 0 }; + } + + auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, iv)[ColumnIdx::C2][0]; + auto num_inputs = inputs.size(); + result = compress(result, field_t(num_inputs)); + for (size_t i = 0; i < num_inputs - 1; i++) { + result = compress(result, inputs[i], true); + } + + return compress_to_point(result, inputs[num_inputs - 1], true); } template @@ -102,6 +143,13 @@ point pedersen_plookup_commitment::commit(const std::vector& inpu return merkle_damgard_compress(inputs, field_t(hash_index)); } +template +point pedersen_plookup_commitment::commit_with_relaxed_range_constraints(const std::vector& inputs, + const size_t hash_index) +{ + return merkle_damgard_compress_with_relaxed_range_constraints(inputs, 
field_t(hash_index)); +} + template point pedersen_plookup_commitment::commit(const std::vector& inputs, const std::vector& hash_indices) @@ -114,6 +162,24 @@ point pedersen_plookup_commitment::commit(const std::vector& inpu return merkle_damgard_compress(inputs, hash_indices_); } +/** + * @brief Calls `compress` but instructs the Pedersen hash method `hash_single` + * to not apply range constraints on the input elements. + * + * Use this method when the input elements are known to be <= 2^252 + * + * @tparam C + * @param inputs + * @param hash_index + * @return field_t + */ +template +field_t pedersen_plookup_commitment::compress_with_relaxed_range_constraints(const std::vector& inputs, + const size_t hash_index) +{ + return commit_with_relaxed_range_constraints(inputs, hash_index).x; +} + template field_t pedersen_plookup_commitment::compress(const std::vector& inputs, const size_t hash_index) { diff --git a/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp b/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp index 42cc8252d239..90076a3035f2 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp @@ -17,14 +17,18 @@ template class pedersen_plookup_commitment { public: static point commit(const std::vector& inputs, const size_t hash_index = 0); static point commit(const std::vector& inputs, const std::vector& hash_indices); + static point commit_with_relaxed_range_constraints(const std::vector& inputs, const size_t hash_index = 0); - static field_t compress(const field_t& left, const field_t& right); + static field_t compress(const field_t& left, const field_t& right, const bool skip_rhs_range_check = false); static field_t compress(const std::vector& inputs, const size_t hash_index = 0); static field_t compress(const packed_byte_array& input) { return compress(input.get_limbs()); } static field_t compress(const std::vector& inputs, const std::vector& hash_indices); static field_t compress(const std::vector>& input_pairs); + static field_t compress_with_relaxed_range_constraints(const std::vector& inputs, + const size_t hash_index = 0); + template static field_t compress(const std::array& inputs) { std::vector in(inputs.begin(), inputs.end()); @@ -33,9 +37,12 @@ template class pedersen_plookup_commitment { static point merkle_damgard_compress(const std::vector& inputs, const field_t& iv); static point merkle_damgard_compress(const std::vector& inputs, const std::vector& ivs); + static point merkle_damgard_compress_with_relaxed_range_constraints(const std::vector& inputs, + const field_t& iv); + static point merkle_damgard_tree_compress(const std::vector& inputs, const std::vector& ivs); - static point compress_to_point(const field_t& left, const field_t& right); + static point compress_to_point(const field_t& left, const field_t& right, const bool skip_rhs_range_check = false); }; extern template class pedersen_plookup_commitment; diff --git a/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp b/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp index 2801648f540d..9b5e84cda0cb 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp @@ -79,7 +79,8 @@ point pedersen_plookup_hash::add_points(const point& p1, const point& p2, /** * Hash a 
single field element using lookup tables. */ -template point pedersen_plookup_hash::hash_single(const field_t& scalar, const bool parity) +template +point pedersen_plookup_hash::hash_single(const field_t& scalar, const bool parity, const bool skip_range_check) { if (scalar.is_constant()) { C* ctx = scalar.get_context(); @@ -93,6 +94,10 @@ template point pedersen_plookup_hash::hash_single(const field const field_t y_lo = witness_t(ctx, uint256_t(scalar.get_value()).slice(0, 126)); ReadData lookup_hi, lookup_lo; + + // If `skip_range_check = true`, this implies the input scalar is 252 bits maximum. + // i.e. we do not require a check that scalar slice sums < p . + // We can also likely use a multitable with 1 less lookup if (parity) { lookup_lo = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_RIGHT_LO, y_lo); lookup_hi = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_RIGHT_HI, y_hi); @@ -101,17 +106,35 @@ template point pedersen_plookup_hash::hash_single(const field lookup_hi = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_LEFT_HI, y_hi); } - // Check if (r_hi - y_hi) is 128 bits and if (r_hi - y_hi) == 0, then - // (r_lo - y_lo) must be 126 bits. - constexpr uint256_t modulus = fr::modulus; - const field_t r_lo = witness_t(ctx, modulus.slice(0, 126)); - const field_t r_hi = witness_t(ctx, modulus.slice(126, 256)); + // validate slices equal scalar + // TODO(suyash?): can remove this gate if we use a single lookup accumulator for HI + LO combined + // can recover y_hi, y_lo from Column 1 of the the lookup accumulator output + scalar.add_two(-y_hi * (uint256_t(1) << 126), -y_lo).assert_equal(0); + + // if skip_range_check = true we assume input max size is 252 bits => final lookup scalar slice value must be 0 + if (skip_range_check) { + lookup_hi[ColumnIdx::C1][lookup_hi[ColumnIdx::C1].size() - 1].assert_equal(0); + } + if (!skip_range_check) { + // Check that y_hi * 2^126 + y_lo < fr::modulus when evaluated over the integers + constexpr uint256_t modulus = fr::modulus; + const field_t r_lo = field_t(ctx, modulus.slice(0, 126)); + const field_t r_hi = field_t(ctx, modulus.slice(126, 256)); - const field_t term_hi = r_hi - y_hi; - const field_t term_lo = (r_lo - y_lo) * field_t(term_hi == field_t(0)); - term_hi.normalize().create_range_constraint(128); - term_lo.normalize().create_range_constraint(126); + bool need_borrow = (uint256_t(y_lo.get_value()) > uint256_t(r_lo.get_value())); + field_t borrow = field_t::from_witness(ctx, need_borrow); + // directly call `create_new_range_constraint` to avoid creating an arithmetic gate + scalar.get_context()->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow"); + + // Hi range check = r_hi - y_hi - borrow + // Lo range check = r_lo - y_lo + borrow * 2^{126} + field_t hi = (r_hi - y_hi) - borrow; + field_t lo = (r_lo - y_lo) + (borrow * (uint256_t(1) << 126)); + + hi.create_range_constraint(128); + lo.create_range_constraint(126); + } const size_t num_lookups_lo = lookup_lo[ColumnIdx::C1].size(); const size_t num_lookups_hi = lookup_hi[ColumnIdx::C1].size(); diff --git a/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp b/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp index 5d099c5c3fa8..2467e3fdb57e 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp @@ -23,7 +23,7 @@ template class pedersen_plookup_hash { public: static 
point add_points(const point& p1, const point& p2, const AddType add_type = ONE); - static point hash_single(const field_t& in, const bool parity); + static point hash_single(const field_t& in, const bool parity, const bool skip_range_check = false); static field_t hash_multiple(const std::vector& in, const size_t hash_index = 0); }; diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp index d324bcc4aff9..96d4c022df51 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp @@ -593,18 +593,19 @@ template bigfield bigfield::operator-(const result.binary_basis_limbs[3].element = binary_basis_limbs[3].element + barretenberg::fr(to_add_3); if constexpr (C::type == ComposerType::PLOOKUP) { - if (result.prime_basis_limb.multiplicative_constant == 1 && - other.prime_basis_limb.multiplicative_constant == 1 && !result.is_constant() && !other.is_constant()) { + if (prime_basis_limb.multiplicative_constant == 1 && other.prime_basis_limb.multiplicative_constant == 1 && + !is_constant() && !other.is_constant()) { bool limbconst = result.binary_basis_limbs[0].element.is_constant(); limbconst = limbconst || result.binary_basis_limbs[1].element.is_constant(); limbconst = limbconst || result.binary_basis_limbs[2].element.is_constant(); limbconst = limbconst || result.binary_basis_limbs[3].element.is_constant(); - limbconst = limbconst || result.prime_basis_limb.is_constant(); + limbconst = limbconst || prime_basis_limb.is_constant(); limbconst = limbconst || other.binary_basis_limbs[0].element.is_constant(); limbconst = limbconst || other.binary_basis_limbs[1].element.is_constant(); limbconst = limbconst || other.binary_basis_limbs[2].element.is_constant(); limbconst = limbconst || other.binary_basis_limbs[3].element.is_constant(); limbconst = limbconst || other.prime_basis_limb.is_constant(); + limbconst = limbconst || (prime_basis_limb.witness_index == other.prime_basis_limb.witness_index); if (!limbconst) { std::pair x0{ result.binary_basis_limbs[0].element.witness_index, binary_basis_limbs[0].element.multiplicative_constant }; @@ -631,10 +632,11 @@ template bigfield bigfield::operator-(const barretenberg::fr c3(result.binary_basis_limbs[3].element.additive_constant - other.binary_basis_limbs[3].element.additive_constant); - uint32_t xp(result.prime_basis_limb.witness_index); + uint32_t xp(prime_basis_limb.witness_index); uint32_t yp(other.prime_basis_limb.witness_index); - barretenberg::fr cp(result.prime_basis_limb.additive_constant - - other.prime_basis_limb.additive_constant); + barretenberg::fr cp(prime_basis_limb.additive_constant - other.prime_basis_limb.additive_constant); + uint512_t constant_to_add_mod_p = (constant_to_add) % prime_basis.modulus; + cp += barretenberg::fr(constant_to_add_mod_p.lo); const auto output_witnesses = ctx->evaluate_non_native_field_subtraction( { x0, y0, c0 }, { x1, y1, c1 }, { x2, y2, c2 }, { x3, y3, c3 }, { xp, yp, cp }); @@ -1982,7 +1984,7 @@ void bigfield::unsafe_evaluate_multiply_add(const bigfield& input_left, modulus, }; // N.B. 
this method also evaluates the prime field component of the non-native field mul - const auto [lo_idx, hi_idx] = ctx->queue_non_native_field_multiplication(witnesses, false); + const auto [lo_idx, hi_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false); barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus)); field_t::evaluate_polynomial_identity(left.prime_basis_limb, @@ -2267,7 +2269,7 @@ void bigfield::unsafe_evaluate_multiple_multiply_add(const std::vector> limb_0_accumulator; std::vector> limb_2_accumulator; std::vector> prime_limb_accumulator; @@ -2320,7 +2322,7 @@ void bigfield::unsafe_evaluate_multiple_multiply_add(const std::vectorevaluate_partial_non_native_field_multiplication(mul_witnesses); + const auto [lo_2_idx, hi_2_idx] = ctx->queue_partial_non_native_field_multiplication(mul_witnesses); field_t lo_2 = field_t::from_witness_index(ctx, lo_2_idx); field_t hi_2 = field_t::from_witness_index(ctx, hi_2_idx); @@ -2416,7 +2418,7 @@ void bigfield::unsafe_evaluate_multiple_multiply_add(const std::vectorqueue_non_native_field_multiplication(witnesses, false); + const auto [lo_1_idx, hi_1_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false); barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus)); diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp index 29d8e472c14c..4330c5b6b455 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp @@ -50,15 +50,15 @@ template class element { void validate_on_curve() const { - Fq xx = x.sqr(); - Fq rhs = y.sqr(); Fq b(get_context(), uint256_t(NativeGroup::curve_b)); - Fq lhs = xx.madd(x, { b }); - if constexpr (NativeGroup::has_a) { + if constexpr (!NativeGroup::has_a) { + // we validate y^2 = x^3 + b by setting "fix_remainder_zero = true" when calling mult_madd + Fq::mult_madd({ x.sqr(), y }, { x, -y }, { b }, true); + } else { Fq a(get_context(), uint256_t(NativeGroup::curve_a)); - lhs = lhs + (a * x); + // we validate y^2 = x^3 + ax + b by setting "fix_remainder_zero = true" when calling mult_madd + Fq::mult_madd({ x.sqr(), x, y }, { -x, a, y }, { b }, true); } - lhs.assert_equal(rhs); } static element one(Composer* ctx) @@ -99,6 +99,7 @@ template class element { *this = *this - other; return *this; } + std::array add_sub(const element& other) const; element operator*(const Fr& other) const; @@ -139,7 +140,7 @@ template class element { bool is_element = false; chain_add_accumulator(){}; - explicit chain_add_accumulator(element& input) + explicit chain_add_accumulator(const element& input) { x3_prev = input.x; y3_prev = input.y; @@ -161,10 +162,8 @@ template class element { element montgomery_ladder(const element& other) const; element montgomery_ladder(const chain_add_accumulator& accumulator); - element double_montgomery_ladder(const element& add1, const element& add2) const; - element double_montgomery_ladder(const chain_add_accumulator& add1, const element& add2) const; - element double_montgomery_ladder(const chain_add_accumulator& add1, const chain_add_accumulator& add2) const; - element double_into_montgomery_ladder(const element& to_add) const; + element multiple_montgomery_ladder(const std::vector& to_add) const; + element quadruple_and_add(const std::vector& to_add) const; typename NativeGroup::affine_element get_value() const { @@ 
-256,12 +255,13 @@ template class element { template ::value>> static std::array, 5> create_group_element_rom_tables( - const std::array& elements); + const std::array& elements, std::array& limb_max); template ::value>> static element read_group_element_rom_tables(const std::array, 5>& tables, - const field_t& index); + const field_t& index, + const std::array& limb_max); static std::pair compute_offset_generators(const size_t num_rounds); @@ -277,6 +277,7 @@ template class element { element operator[](const size_t idx) const { return element_table[idx]; } std::array element_table; std::array, 5> coordinates; + std::array limb_max; // tracks the maximum limb size represented in each element_table entry }; template ::value>> @@ -310,7 +311,6 @@ template class element { P1.element_table[i] = P1.element_table[i - 1] + d2; } for (size_t i = 0; i < 8; ++i) { - // TODO: DO WE NEED TO REDUCE THESE ELEMENTS???? P1.element_table[i] = (-P1.element_table[15 - i]); } for (size_t i = 0; i < 16; ++i) { @@ -322,8 +322,8 @@ template class element { endoP1.element_table[i].x = P1.element_table[i].x * beta; endoP1.element_table[15 - i].x = endoP1.element_table[i].x; } - P1.coordinates = create_group_element_rom_tables<16>(P1.element_table); - endoP1.coordinates = create_group_element_rom_tables<16>(endoP1.element_table); + P1.coordinates = create_group_element_rom_tables<16>(P1.element_table, P1.limb_max); + endoP1.coordinates = create_group_element_rom_tables<16>(endoP1.element_table, endoP1.limb_max); auto result = std::make_pair, four_bit_table_plookup<>>( (four_bit_table_plookup<>)P1, (four_bit_table_plookup<>)endoP1); return result; @@ -391,6 +391,7 @@ template class element { std::array element_table; std::array, 5> coordinates; + std::array limb_max; }; using twin_lookup_table = typename std:: @@ -418,10 +419,10 @@ template class element { endo_table.element_table[i + 8].x = base_table[7 - i].x * beta; endo_table.element_table[i + 8].y = base_table[7 - i].y; - endo_table.element_table[7 - i] = (-endo_table.element_table[i + 8]).reduce(); + endo_table.element_table[7 - i] = (-endo_table.element_table[i + 8]); } - endo_table.coordinates = create_group_element_rom_tables<16>(endo_table.element_table); + endo_table.coordinates = create_group_element_rom_tables<16>(endo_table.element_table, endo_table.limb_max); } else { std::array endo_inputs(inputs); for (auto& input : endo_inputs) { @@ -451,10 +452,10 @@ template class element { endo_table.element_table[i + 16].x = base_table[15 - i].x * beta; endo_table.element_table[i + 16].y = base_table[15 - i].y; - endo_table.element_table[15 - i] = (-endo_table.element_table[i + 16]).reduce(); + endo_table.element_table[15 - i] = (-endo_table.element_table[i + 16]); } - endo_table.coordinates = create_group_element_rom_tables<32>(endo_table.element_table); + endo_table.coordinates = create_group_element_rom_tables<32>(endo_table.element_table, endo_table.limb_max); } return std::make_pair, lookup_table_plookup<5>>((lookup_table_plookup<5>)base_table, (lookup_table_plookup<5>)endo_table); @@ -472,11 +473,16 @@ template class element { num_points = points.size(); num_fives = num_points / 5; + // size-6 table is expensive and only benefits us if creating them reduces the number of total tables if (num_fives * 5 == (num_points - 1)) { num_fives -= 1; num_sixes = 1; - } else { - num_sixes = 0; + } else if (num_fives * 5 == (num_points - 2) && num_fives >= 2) { + num_fives -= 2; + num_sixes = 2; + } else if (num_fives * 5 == (num_points - 3) && num_fives >= 3) { + 
num_fives -= 3; + num_sixes = 3; } has_quad = ((num_fives * 5 + num_sixes * 6) < num_points - 3) && (num_points >= 4); @@ -490,33 +496,40 @@ template class element { has_singleton = num_points != ((num_fives * 5 + num_sixes * 6) + ((size_t)has_quad * 4) + ((size_t)has_triple * 3) + ((size_t)has_twin * 2)); + size_t offset = 0; + for (size_t i = 0; i < num_sixes; ++i) { + six_tables.push_back(lookup_table_plookup<6>({ + points[offset + 6 * i], + points[offset + 6 * i + 1], + points[offset + 6 * i + 2], + points[offset + 6 * i + 3], + points[offset + 6 * i + 4], + points[offset + 6 * i + 5], + })); + } + offset += 6 * num_sixes; for (size_t i = 0; i < num_fives; ++i) { - five_tables.push_back(lookup_table_plookup<5>( - { points[5 * i], points[5 * i + 1], points[5 * i + 2], points[5 * i + 3], points[5 * i + 4] })); - } - - if (num_sixes == 1) { - six_tables.push_back(lookup_table_plookup<6>({ points[5 * num_fives], - points[5 * num_fives + 1], - points[5 * num_fives + 2], - points[5 * num_fives + 3], - points[5 * num_fives + 4], - points[5 * num_fives + 5] })); + five_tables.push_back(lookup_table_plookup<5>({ + points[offset + 5 * i], + points[offset + 5 * i + 1], + points[offset + 5 * i + 2], + points[offset + 5 * i + 3], + points[offset + 5 * i + 4], + })); } + offset += 5 * num_fives; if (has_quad) { - quad_tables.push_back(quad_lookup_table({ points[5 * num_fives], - points[5 * num_fives + 1], - points[5 * num_fives + 2], - points[5 * num_fives + 3] })); + quad_tables.push_back( + quad_lookup_table({ points[offset], points[offset + 1], points[offset + 2], points[offset + 3] })); } if (has_triple) { - triple_tables.push_back(triple_lookup_table( - { points[5 * num_fives], points[5 * num_fives + 1], points[5 * num_fives + 2] })); + triple_tables.push_back( + triple_lookup_table({ points[offset], points[offset + 1], points[offset + 2] })); } if (has_twin) { - twin_tables.push_back(twin_lookup_table({ points[5 * num_fives], points[5 * num_fives + 1] })); + twin_tables.push_back(twin_lookup_table({ points[offset], points[offset + 1] })); } if (has_singleton) { @@ -587,37 +600,36 @@ template class element { element::chain_add_accumulator get_chain_add_accumulator(std::vector>& naf_entries) const { std::vector round_accumulator; + for (size_t j = 0; j < num_sixes; ++j) { + round_accumulator.push_back(six_tables[j].get({ naf_entries[6 * j], + naf_entries[6 * j + 1], + naf_entries[6 * j + 2], + naf_entries[6 * j + 3], + naf_entries[6 * j + 4], + naf_entries[6 * j + 5] })); + } + size_t offset = num_sixes * 6; for (size_t j = 0; j < num_fives; ++j) { - round_accumulator.push_back(five_tables[j].get({ naf_entries[5 * j], - naf_entries[5 * j + 1], - naf_entries[5 * j + 2], - naf_entries[5 * j + 3], - naf_entries[5 * j + 4] })); - } - - if (num_sixes == 1) { - round_accumulator.push_back(six_tables[0].get({ naf_entries[num_fives * 5], - naf_entries[num_fives * 5 + 1], - naf_entries[num_fives * 5 + 2], - naf_entries[num_fives * 5 + 3], - naf_entries[num_fives * 5 + 4], - naf_entries[num_fives * 5 + 5] })); + round_accumulator.push_back(five_tables[j].get({ naf_entries[offset + j * 5], + naf_entries[offset + j * 5 + 1], + naf_entries[offset + j * 5 + 2], + naf_entries[offset + j * 5 + 3], + naf_entries[offset + j * 5 + 4] })); } - + offset += num_fives * 5; if (has_quad) { - round_accumulator.push_back(quad_tables[0].get({ naf_entries[num_fives * 5], - naf_entries[num_fives * 5 + 1], - naf_entries[num_fives * 5 + 2], - naf_entries[num_fives * 5 + 3] })); + 
round_accumulator.push_back(quad_tables[0].get({ naf_entries[offset], + naf_entries[offset + 1], + naf_entries[offset + 2], + naf_entries[offset + 3] })); } if (has_triple) { - round_accumulator.push_back(triple_tables[0].get( - { naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1], naf_entries[num_fives * 5 + 2] })); + round_accumulator.push_back( + triple_tables[0].get({ naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2] })); } if (has_twin) { - round_accumulator.push_back( - twin_tables[0].get({ naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1] })); + round_accumulator.push_back(twin_tables[0].get({ naf_entries[offset], naf_entries[offset + 1] })); } if (has_singleton) { round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1])); @@ -640,37 +652,37 @@ template class element { element get(std::vector>& naf_entries) const { std::vector round_accumulator; - for (size_t j = 0; j < num_fives; ++j) { - round_accumulator.push_back(five_tables[j].get({ naf_entries[5 * j], - naf_entries[5 * j + 1], - naf_entries[5 * j + 2], - naf_entries[5 * j + 3], - naf_entries[5 * j + 4] })); + for (size_t j = 0; j < num_sixes; ++j) { + round_accumulator.push_back(six_tables[j].get({ naf_entries[6 * j], + naf_entries[6 * j + 1], + naf_entries[6 * j + 2], + naf_entries[6 * j + 3], + naf_entries[6 * j + 4], + naf_entries[6 * j + 5] })); } + size_t offset = num_sixes * 6; - if (num_sixes == 1) { - round_accumulator.push_back(six_tables[0].get({ naf_entries[num_fives * 5], - naf_entries[num_fives * 5 + 1], - naf_entries[num_fives * 5 + 2], - naf_entries[num_fives * 5 + 3], - naf_entries[num_fives * 5 + 4], - naf_entries[num_fives * 5 + 5] })); + for (size_t j = 0; j < num_fives; ++j) { + round_accumulator.push_back(five_tables[j].get({ naf_entries[offset + 5 * j], + naf_entries[offset + 5 * j + 1], + naf_entries[offset + 5 * j + 2], + naf_entries[offset + 5 * j + 3], + naf_entries[offset + 5 * j + 4] })); } + offset += num_fives * 5; + if (has_quad) { - round_accumulator.push_back(quad_tables[0].get(naf_entries[num_fives * 5], - naf_entries[num_fives * 5 + 1], - naf_entries[num_fives * 5 + 2], - naf_entries[num_fives * 5 + 3])); + round_accumulator.push_back(quad_tables[0].get( + naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2], naf_entries[offset + 3])); } if (has_triple) { - round_accumulator.push_back(triple_tables[0].get( - naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1], naf_entries[num_fives * 5 + 2])); + round_accumulator.push_back( + triple_tables[0].get(naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2])); } if (has_twin) { - round_accumulator.push_back( - twin_tables[0].get(naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1])); + round_accumulator.push_back(twin_tables[0].get(naf_entries[offset], naf_entries[offset + 1])); } if (has_singleton) { round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1])); @@ -862,67 +874,6 @@ template class element { return element::chain_add_end(accumulator); } - // chain_add_accumulator get_chain_initial_entry() const - // { - // std::vector add_accumulator; - // for (size_t i = 0; i < num_quads; ++i) { - // add_accumulator.push_back(quad_tables[i][0]); - // } - // if (has_twin) { - // add_accumulator.push_back(twin_tables[0][0]); - // } - // if (has_triple) { - // add_accumulator.push_back(triple_tables[0][0]); - // } - // if (has_singleton) { - // add_accumulator.push_back(singletons[0]); - // } - // if 
(add_accumulator.size() >= 2) { - // chain_add_accumulator output = element::chain_add_start(add_accumulator[0], add_accumulator[1]); - // for (size_t i = 2; i < add_accumulator.size(); ++i) { - // output = element::chain_add(add_accumulator[i], output); - // } - // return output; - // } - // return chain_add_accumulator(add_accumulator[0]); - // } - - // element::chain_add_accumulator get_chain_add_accumulator(std::vector>& naf_entries) const - // { - // std::vector round_accumulator; - // for (size_t j = 0; j < num_quads; ++j) { - // round_accumulator.push_back(quad_tables[j].get( - // naf_entries[4 * j], naf_entries[4 * j + 1], naf_entries[4 * j + 2], naf_entries[4 * j + 3])); - // } - - // if (has_triple) { - // round_accumulator.push_back(triple_tables[0].get( - // naf_entries[num_quads * 4], naf_entries[num_quads * 4 + 1], naf_entries[num_quads * 4 + 2])); - // } - // if (has_twin) { - // round_accumulator.push_back( - // twin_tables[0].get(naf_entries[num_quads * 4], naf_entries[num_quads * 4 + 1])); - // } - // if (has_singleton) { - // round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1])); - // } - - // element::chain_add_accumulator accumulator; - // if (round_accumulator.size() == 1) { - // accumulator.x3_prev = round_accumulator[0].x; - // accumulator.y3_prev = round_accumulator[0].y; - // accumulator.is_element = true; - // return accumulator; - // } else if (round_accumulator.size() == 2) { - // return element::chain_add_start(round_accumulator[0], round_accumulator[1]); - // } else { - // accumulator = element::chain_add_start(round_accumulator[0], round_accumulator[1]); - // for (size_t j = 2; j < round_accumulator.size(); ++j) { - // accumulator = element::chain_add(round_accumulator[j], accumulator); - // } - // } - // return (accumulator); - // } std::vector quad_tables; std::vector triple_tables; std::vector twin_tables; diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp index e3b6576e9fc0..85b4e2325cab 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp @@ -415,7 +415,7 @@ template class stdlib_biggroup : public testing::Test { EXPECT_VERIFICATION(composer); } - static void test_double_montgomery_ladder() + static void test_multiple_montgomery_ladder() { Composer composer = Composer(); size_t num_repetitions = 10; @@ -423,19 +423,17 @@ template class stdlib_biggroup : public testing::Test { affine_element acc_small(element::random_element()); element_ct acc_big = element_ct::from_witness(&composer, acc_small); - affine_element add_1_small_0(element::random_element()); - element_ct add_1_big_0 = element_ct::from_witness(&composer, add_1_small_0); - affine_element add_2_small_0(element::random_element()); - element_ct add_2_big_0 = element_ct::from_witness(&composer, add_2_small_0); - - affine_element add_1_small_1(element::random_element()); - element_ct add_1_big_1 = element_ct::from_witness(&composer, add_1_small_1); - affine_element add_2_small_1(element::random_element()); - element_ct add_2_big_1 = element_ct::from_witness(&composer, add_2_small_1); - - typename element_ct::chain_add_accumulator add_1 = element_ct::chain_add_start(add_1_big_0, add_1_big_1); - typename element_ct::chain_add_accumulator add_2 = element_ct::chain_add_start(add_2_big_0, add_2_big_1); - 
acc_big.double_montgomery_ladder(add_1, add_2); + std::vector to_add; + for (size_t j = 0; j < i; ++j) { + affine_element add_1_small_0(element::random_element()); + element_ct add_1_big_0 = element_ct::from_witness(&composer, add_1_small_0); + affine_element add_2_small_0(element::random_element()); + element_ct add_2_big_0 = element_ct::from_witness(&composer, add_2_small_0); + typename element_ct::chain_add_accumulator add_1 = + element_ct::chain_add_start(add_1_big_0, add_2_big_0); + to_add.emplace_back(add_1); + } + acc_big.multiple_montgomery_ladder(to_add); } EXPECT_VERIFICATION(composer); @@ -890,10 +888,10 @@ HEAVY_TYPED_TEST(stdlib_biggroup, chain_add) TestFixture::test_chain_add(); } -HEAVY_TYPED_TEST(stdlib_biggroup, double_montgomery_ladder) +HEAVY_TYPED_TEST(stdlib_biggroup, multiple_montgomery_ladder) { - TestFixture::test_double_montgomery_ladder(); + TestFixture::test_multiple_montgomery_ladder(); } HEAVY_TYPED_TEST(stdlib_biggroup, compute_naf) diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp index c237d0dfbb72..ebeb0aee5b1f 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp @@ -41,14 +41,11 @@ element element::wnaf_batch_mul(const std::vector to_add; + for (size_t j = 0; j < points.size(); ++j) { + to_add.emplace_back(point_tables[j][wnaf_entries[j][i]]); } - // accumulator = accumulator.dbl(); - // accumulator = accumulator.montgomery_ladder(to_add); - accumulator = accumulator.double_into_montgomery_ladder(to_add); + accumulator = accumulator.quadruple_and_add(to_add); } for (size_t i = 0; i < points.size(); ++i) { diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp index 834a1b01ef19..924cd0d7ecdd 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp @@ -117,34 +117,45 @@ element element::bn254_endo_batch_mul_with_generator return to_add; }; - for (size_t i = 1; i < num_rounds / 2; ++i) { + // Perform multiple rounds of the montgomery ladder algoritm per "iteration" of our main loop. + // This is in order to reduce the number of field reductions required when calling `multiple_montgomery_ladder` + constexpr size_t num_rounds_per_iteration = 4; - auto add_1 = get_point_to_add(i * 2 - 1); - auto add_2 = get_point_to_add(i * 2); + // we require that we perform max of one generator per iteration + static_assert(num_rounds_per_iteration < 8); - // TODO update this to work if num_bits is odd - if ((i * 2) % 8 == 0) { - add_1 = element::chain_add(generator_table[generator_wnaf[(i * 2 - 8) / 8]], add_1); - add_1 = element::chain_add(generator_endo_table[generator_endo_wnaf[(i * 2 - 8) / 8]], add_1); - } - if (!add_1.is_element) { - accumulator = accumulator.double_montgomery_ladder(add_1, add_2); - } else { - accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev), - element(add_2.x3_prev, add_2.y3_prev)); + size_t num_iterations = num_rounds / num_rounds_per_iteration; + num_iterations += ((num_iterations * num_rounds_per_iteration) == num_rounds) ? 
0 : 1; + const size_t num_rounds_per_final_iteration = + (num_rounds - 1) - ((num_iterations - 1) * num_rounds_per_iteration); + + size_t generator_idx = 0; + for (size_t i = 0; i < num_iterations; ++i) { + + const size_t inner_num_rounds = + (i != num_iterations - 1) ? num_rounds_per_iteration : num_rounds_per_final_iteration; + std::vector to_add; + + for (size_t j = 0; j < inner_num_rounds; ++j) { + to_add.emplace_back(get_point_to_add(i * num_rounds_per_iteration + j + 1)); } - } - if ((num_rounds & 0x01ULL) == 0x00ULL) { - auto add_1 = get_point_to_add(num_rounds - 1); - add_1 = element::chain_add(generator_table[generator_wnaf[generator_wnaf.size() - 2]], add_1); - add_1 = element::chain_add(generator_endo_table[generator_endo_wnaf[generator_wnaf.size() - 2]], add_1); - if (add_1.is_element) { - element temp(add_1.x3_prev, add_1.y3_prev); - accumulator = accumulator.montgomery_ladder(temp); - } else { - accumulator = accumulator.montgomery_ladder(add_1); + bool add_generator_this_round = false; + size_t add_idx = 0; + for (size_t j = 0; j < inner_num_rounds; ++j) { + add_generator_this_round = ((i * num_rounds_per_iteration + j) % 8) == 6; + if (add_generator_this_round) { + add_idx = j; + break; + } + } + if (add_generator_this_round) { + to_add[add_idx] = element::chain_add(generator_table[generator_wnaf[generator_idx]], to_add[add_idx]); + to_add[add_idx] = + element::chain_add(generator_endo_table[generator_endo_wnaf[generator_idx]], to_add[add_idx]); + generator_idx++; } + accumulator = accumulator.multiple_montgomery_ladder(to_add); } for (size_t i = 0; i < small_points.size(); ++i) { @@ -333,12 +344,12 @@ element element::bn254_endo_batch_mul(const std::vec * 1. Extract NAF value for bit `2*i - 1` for each scalar multiplier and store in `nafs` vector. * 2. Use `nafs` vector to derive the point that we need (`add_1`) to add into our accumulator. * 3. Repeat the above 2 steps but for bit `2 * i` (`add_2`) - * 4. Compute `accumulator = 4 * accumulator + 2 * add_1 + add_2` using `double_montgomery_ladder` method + * 4. Compute `accumulator = 4 * accumulator + 2 * add_1 + add_2` using `multiple_montgomery_ladder` method * * The purpose of the above is to minimize the number of required range checks (vs a simple double and add algo). * - * When computing two iterations of the montgomery ladder algorithm, we can neglect computing the y-coordinate of - *the 1st ladder output. See `double_montgomery_ladder` for more details. + * When computing repeated iterations of the montgomery ladder algorithm, we can neglect computing the y-coordinate + *of each ladder output. See `multiple_montgomery_ladder` for more details. **/ for (size_t i = 1; i < num_rounds / 2; ++i) { // `nafs` tracks the naf value for each point for the current round @@ -365,14 +376,8 @@ element element::bn254_endo_batch_mul(const std::vec } element::chain_add_accumulator add_2 = point_table.get_chain_add_accumulator(nafs); - // Perform the double montgomery ladder. We need to convert our chain_add_accumulator types into regular - // elements if the accumuator does not contain a y-coordinate - if (!add_1.is_element) { - accumulator = accumulator.double_montgomery_ladder(add_1, add_2); - } else { - accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev), - element(add_2.x3_prev, add_2.y3_prev)); - } + // Perform the double montgomery ladder. 
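To make the new loop bookkeeping concrete (it is introduced in `bn254_endo_batch_mul_with_generator` above and reused in `batch_mul` further below): wnaf round 0 is consumed when the accumulator is initialised before the loop, and the remaining `num_rounds - 1` rounds are processed in batches of `num_rounds_per_iteration = 4`, each batch feeding one `multiple_montgomery_ladder` call. A small worked example, using a hypothetical round count of 63 (the real value depends on the wnaf window parameters):

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        constexpr size_t num_rounds = 63;              // hypothetical value for illustration
        constexpr size_t num_rounds_per_iteration = 4; // as in the diff

        size_t num_iterations = num_rounds / num_rounds_per_iteration; // 15
        num_iterations += ((num_iterations * num_rounds_per_iteration) == num_rounds) ? 0 : 1; // 16
        const size_t num_rounds_per_final_iteration =
            (num_rounds - 1) - ((num_iterations - 1) * num_rounds_per_iteration); // 62 - 60 = 2

        // 15 full iterations cover 60 rounds and the final iteration covers 2,
        // i.e. 62 = num_rounds - 1 rounds in total, as required.
        std::printf("iterations = %zu, rounds in final iteration = %zu\n",
                    num_iterations, num_rounds_per_final_iteration);
        return 0;
    }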
+ accumulator = accumulator.multiple_montgomery_ladder({ add_1, add_2 }); } // we need to iterate 1 more time if the number of rounds is even @@ -382,12 +387,7 @@ element element::bn254_endo_batch_mul(const std::vec nafs.emplace_back(naf_entries[j][num_rounds - 1]); } element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs); - if (add_1.is_element) { - element temp(add_1.x3_prev, add_1.y3_prev); - accumulator = accumulator.montgomery_ladder(temp); - } else { - accumulator = accumulator.montgomery_ladder(add_1); - } + accumulator = accumulator.multiple_montgomery_ladder({ add_1 }); } /** diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp index 78d6f378554a..bff1805db3ca 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp @@ -80,6 +80,39 @@ element element::operator-(const element& other) con return element(x_3, y_3); } + +/** + * @brief Compute (*this) + other AND (*this) - other as a size-2 array + * + * @details We require this operation when computing biggroup lookup tables for + * multi-scalar-multiplication. This combined method reduces the number of + * field additions, field subtractions required (as well as 1 less assert_is_not_equal check) + * + * @tparam C + * @tparam Fq + * @tparam Fr + * @tparam G + * @param other + * @return std::array, 2> + */ +template +std::array, 2> element::add_sub(const element& other) const +{ + other.x.assert_is_not_equal(x); + + const Fq denominator = other.x - x; + const Fq x2x1 = -(other.x + x); + + const Fq lambda1 = Fq::div_without_denominator_check({ other.y, -y }, denominator); + const Fq x_3 = lambda1.sqradd({ x2x1 }); + const Fq y_3 = lambda1.madd(x - x_3, { -y }); + const Fq lambda2 = Fq::div_without_denominator_check({ -other.y, -y }, denominator); + const Fq x_4 = lambda2.sqradd({ x2x1 }); + const Fq y_4 = lambda2.madd(x - x_4, { -y }); + + return { element(x_3, y_3), element(x_4, y_4) }; +} + template element element::dbl() const { Fq two_x = x + x; @@ -294,200 +327,217 @@ element element::montgomery_ladder(const chain_add_a } /** - * Compute (4 * (*this)) + (2 * add1) + add2 - * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction. + * @brief Compute 4.P + to_add[0] + ... + to_add[to_add.size() - 1] * - * Total number of field reductions = 9 + * @details Used in wnaf_batch_mul method. Combining operations requires fewer bigfield reductions. * - * Two calls to mont ladder woud require 10 + * Method computes R[i] = (2P + A[0]) + (2P + A[1]) + A[2] + ... + A[n-1] * - * Using doublings and additions would require 12! 
- **/ + * @tparam C + * @tparam Fq + * @tparam Fr + * @tparam G + * @param to_add + * @return element + */ template -element element::double_montgomery_ladder(const element& add1, const element& add2) const +element element::quadruple_and_add(const std::vector& to_add) const { - add1.x.assert_is_not_equal(x); - const Fq lambda_1 = Fq::div_without_denominator_check({ add1.y, -y }, (add1.x - x)); - - const Fq x_3 = lambda_1.sqradd({ -add1.x, -x }); - - const Fq minus_lambda_2 = - lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x); - - const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 }); - - // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed. - // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division) - // with only one non-native field reduction. - // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder. - // Defining the quotient and remainder elements is the major cost of a non-native field multiplication - // because each requires ~256 bits of range checks - const Fq x_sub_x4 = x - x_4; - - const Fq x4_sub_add2x = x_4 - add2.x; + const Fq two_x = x + x; + Fq x_1; + Fq minus_lambda_dbl; + if constexpr (G::has_a) { + Fq a(get_context(), uint256_t(G::curve_a)); + minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), { a }); + x_1 = minus_lambda_dbl.sqradd({ -(two_x) }); + } else { + minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), {}); + x_1 = minus_lambda_dbl.sqradd({ -(two_x) }); + } - // msub_div; 'compute a multiplication and a division and multiply the two together. Requires only 1 non native - // field reduction` - const Fq lambda_3 = Fq::msub_div({ minus_lambda_2 }, { (x_sub_x4) }, (x4_sub_add2x), { y, add2.y }); + ASSERT(to_add.size() > 0); + to_add[0].x.assert_is_not_equal(x_1); - // validate we can use incomplete addition formulae - x_4.assert_is_not_equal(add2.x); + const Fq x_minus_x_1 = x - x_1; - const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x }); - const Fq x5_sub_x4 = x_5 - x_4; + const Fq lambda_1 = Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_1 - to_add[0].x), { to_add[0].y, y }); - const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y }); + const Fq x_3 = lambda_1.sqradd({ -to_add[0].x, -x_1 }); - const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3; - const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3; - const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 }); + const Fq half_minus_lambda_2_minus_lambda_1 = + Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_3 - x_1), { y }); - const Fq x6_sub_x4 = x_6 - x_4; + const Fq minus_lambda_2_minus_lambda_1 = half_minus_lambda_2_minus_lambda_1 + half_minus_lambda_2_minus_lambda_1; + const Fq minus_lambda_2 = minus_lambda_2_minus_lambda_1 + lambda_1; - // y_6 = -L_4 * (x_6 - x_4) - L_2 * (x - x_4) + y - const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y }); + const Fq x_4 = minus_lambda_2.sqradd({ -x_1, -x_3 }); - return element(x_6, y_6); -} + const Fq x_4_sub_x_1 = x_4 - x_1; -/** - * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction - * - **/ -template -element element::double_montgomery_ladder(const chain_add_accumulator& add1, - const element& add2) const -{ - if 
(add1.is_element) { - throw_or_abort("An accumulator expected"); + if (to_add.size() == 1) { + const Fq y_4 = Fq::dual_madd(minus_lambda_2, x_4_sub_x_1, minus_lambda_dbl, x_minus_x_1, { y }); + return element(x_4, y_4); } - add1.x3_prev.assert_is_not_equal(x); - Fq lambda_1 = Fq::msub_div( - { add1.lambda_prev }, { (add1.x1_prev - add1.x3_prev) }, (x - add1.x3_prev), { -add1.y1_prev, -y }); - - const Fq x_3 = lambda_1.sqradd({ -add1.x3_prev, -x }); - - const Fq minus_lambda_2 = - lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x); - - const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 }); - - // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed. - // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division) - // with only one non-native field reduction. - // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder, which is the major cost - // of a non-native field multiplication - const Fq x_sub_x4 = x - x_4; + to_add[1].x.assert_is_not_equal(to_add[0].x); - const Fq x4_sub_add2x = x_4 - add2.x; - const Fq lambda_3 = Fq::msub_div({ minus_lambda_2 }, { (x_sub_x4) }, (x4_sub_add2x), { y, add2.y }); + Fq minus_lambda_3 = Fq::msub_div( + { minus_lambda_dbl, minus_lambda_2 }, { x_minus_x_1, x_4_sub_x_1 }, (x_4 - to_add[1].x), { y, -(to_add[1].y) }); - x_4.assert_is_not_equal(add2.x); + // X5 = L3.L3 - X4 - XB + const Fq x_5 = minus_lambda_3.sqradd({ -x_4, -to_add[1].x }); - const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x }); - const Fq x5_sub_x4 = x_5 - x_4; + if (to_add.size() == 2) { + // Y5 = L3.(XB - X5) - YB + const Fq y_5 = minus_lambda_3.madd(x_5 - to_add[1].x, { -to_add[1].y }); + return element(x_5, y_5); + } - const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y }); + Fq x_prev = x_5; + Fq minus_lambda_prev = minus_lambda_3; - const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3; - const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3; - const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 }); + for (size_t i = 2; i < to_add.size(); ++i) { - const Fq x6_sub_x4 = x_6 - x_4; + to_add[i].x.assert_is_not_equal(to_add[i - 1].x); + // Lambda = Yprev - Yadd[i] / Xprev - Xadd[i] + // = -Lprev.(Xprev - Xadd[i-1]) - Yadd[i - 1] - Yadd[i] / Xprev - Xadd[i] + const Fq minus_lambda = Fq::msub_div({ minus_lambda_prev }, + { to_add[i - 1].x - x_prev }, + (to_add[i].x - x_prev), + { to_add[i - 1].y, to_add[i].y }); + // X = Lambda * Lambda - Xprev - Xadd[i] + const Fq x_out = minus_lambda.sqradd({ -x_prev, -to_add[i].x }); - const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y }); + x_prev = x_out; + minus_lambda_prev = minus_lambda; + } + const Fq y_out = minus_lambda_prev.madd(x_prev - to_add[to_add.size() - 1].x, { -to_add[to_add.size() - 1].y }); - return element(x_6, y_6); + return element(x_prev, y_out); } /** - * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction + * @brief Perform repeated iterations of the montgomery ladder algorithm. * - **/ + * For points P, Q, montgomery ladder computes R = (P + Q) + P + * i.e. it's "double-and-add" without explicit doublings. + * + * This method can apply repeated iterations of the montgomery ladder. 
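For reference, a single ladder iteration computes R = 2P + Q from the accumulator P = (x, y) and an addend Q = (x_a, y_a) via the standard incomplete short-Weierstrass addition formulas, with the intermediate y-coordinate never materialised (written here with the same variable naming style as the code, assuming x_a != x):

    lambda_1 = (y_a - y) / (x_a - x)
    x_3      = lambda_1^2 - x_a - x             // x-coordinate of P + Q; y_3 is skipped
    lambda_2 = -lambda_1 - (2 * y) / (x_3 - x)  // slope of (P + Q) + P, using y_3 only implicitly
    x_4      = lambda_2^2 - x_3 - x
    y_4      = lambda_2 * (x - x_4) - y         // the only y-coordinate that is ever evaluated

Chaining several addends defers the y-coordinate in the same way: each intermediate y is carried as an unevaluated sum of products (the `composite_y` terms below) and only collapsed into a single `Fq::mult_madd` at the end of the method.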
+ * Each iteration reduces the number of field multiplications by 1, at the cost of more additions. + * (i.e. we don't compute intermediate y-coordinates). + * + * The number of additions scales with the size of the input vector. The optimal input size appears to be 4. + * + * @tparam C + * @tparam Fq + * @tparam Fr + * @tparam G + * @param add + * @return element + */ template -element element::double_montgomery_ladder(const chain_add_accumulator& add1, - const chain_add_accumulator& add2) const +element element::multiple_montgomery_ladder( + const std::vector& add) const { - if ((add1.is_element) || (add2.is_element)) { - throw_or_abort("An accumulator expected"); - } - add1.x3_prev.assert_is_not_equal(x); - // add1.y = lambda_prev * (x1_prev - x3_prev) - y1_prev - Fq lambda_1 = Fq::msub_div( - { add1.lambda_prev }, { (add1.x1_prev - add1.x3_prev) }, (x - add1.x3_prev), { -add1.y1_prev, -y }); - - const Fq x_3 = lambda_1.sqradd({ -add1.x3_prev, -x }); - - const Fq minus_lambda_2 = - lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x); - - const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 }); - - // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed. - // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division) - // with only one non-native field reduction. - // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder, which is the major cost - // of a non-native field multiplication - const Fq x_sub_x4 = x - x_4; - - const Fq x4_sub_add2x = x_4 - add2.x3_prev; - - const Fq lambda_3 = Fq::msub_div({ minus_lambda_2, add2.lambda_prev }, - { (x_sub_x4), (add2.x1_prev - add2.x3_prev) }, - (x4_sub_add2x), - { y, -add2.y1_prev }); - - x_4.assert_is_not_equal(add2.x3_prev); - - const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x3_prev }); - const Fq x5_sub_x4 = x_5 - x_4; - - const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y }); + struct composite_y { + std::vector mul_left; + std::vector mul_right; + std::vector add; + bool is_negative = false; + }; + + Fq previous_x = x; + composite_y previous_y{ std::vector(), std::vector(), std::vector(), false }; + for (size_t i = 0; i < add.size(); ++i) { + previous_x.assert_is_not_equal(add[i].x3_prev); + + // composite_y add_y; + bool negate_add_y = (i > 0) && !previous_y.is_negative; + std::vector lambda1_left; + std::vector lambda1_right; + std::vector lambda1_add; + + if (i == 0) { + lambda1_add.emplace_back(-y); + } else { + lambda1_left = previous_y.mul_left; + lambda1_right = previous_y.mul_right; + lambda1_add = previous_y.add; + } - const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3; - const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3; - const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 }); + if (!add[i].is_element) { + lambda1_left.emplace_back(add[i].lambda_prev); + lambda1_right.emplace_back(negate_add_y ? add[i].x3_prev - add[i].x1_prev + : add[i].x1_prev - add[i].x3_prev); + lambda1_add.emplace_back(negate_add_y ? add[i].y1_prev : -add[i].y1_prev); + } else if (i > 0) { + lambda1_add.emplace_back(negate_add_y ? 
-add[i].y3_prev : add[i].y3_prev); + } + // if previous_y is negated then add stays positive + // if previous_y is positive then add stays negated + // | add.y is negated | previous_y is negated | output of msub_div is -lambda | + // | --- | --- | --- | + // | no | yes | yes | + // | yes | no | no | + + Fq lambda1; + if (!add[i].is_element || i > 0) { + bool flip_lambda1_denominator = !negate_add_y; + Fq denominator = flip_lambda1_denominator ? previous_x - add[i].x3_prev : add[i].x3_prev - previous_x; + lambda1 = Fq::msub_div(lambda1_left, lambda1_right, denominator, lambda1_add); + } else { + lambda1 = Fq::div_without_denominator_check({ add[i].y3_prev - y }, (add[i].x3_prev - x)); + } - const Fq x6_sub_x4 = x_6 - x_4; + Fq x_3 = lambda1.madd(lambda1, { -add[i].x3_prev, -previous_x }); - const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y }); + // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where + // needed. This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a + // field division) with only one non-native field reduction. E.g. evaluating (a * b) + (c * d) = e mod p only + // requires 1 quotient and remainder, which is the major cost of a non-native field multiplication + Fq lambda2; + if (i == 0) { + lambda2 = Fq::div_without_denominator_check({ y + y }, (previous_x - x_3)) - lambda1; + } else { + Fq l2_denominator = previous_y.is_negative ? previous_x - x_3 : x_3 - previous_x; + Fq partial_lambda2 = + Fq::msub_div(previous_y.mul_left, previous_y.mul_right, l2_denominator, previous_y.add); + partial_lambda2 = partial_lambda2 + partial_lambda2; + lambda2 = partial_lambda2 - lambda1; + } - return element(x_6, y_6); -} -/** - * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction - **/ -template -element element::double_into_montgomery_ladder(const element& add1) const -{ - const Fq two_x = x + x; - Fq x_1; - Fq minus_lambda_dbl; - if constexpr (G::has_a) { - Fq a(get_context(), uint256_t(G::curve_a)); - minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), { a }); - x_1 = minus_lambda_dbl.sqradd({ -(two_x) }); - } else { - minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), {}); - x_1 = minus_lambda_dbl.sqradd({ -(two_x) }); + Fq x_4 = lambda2.sqradd({ -x_3, -previous_x }); + composite_y y_4; + if (i == 0) { + // We want to make sure that at the final iteration, `y_previous.is_negative = false` + // Each iteration flips the sign of y_previous.is_negative. + // i.e. whether we store y_4 or -y_4 depends on the number of points we have + bool num_points_even = ((add.size() & 0x01UL) == 0); + y_4.add.emplace_back(num_points_even ? y : -y); + y_4.mul_left.emplace_back(lambda2); + y_4.mul_right.emplace_back(num_points_even ? x_4 - previous_x : previous_x - x_4); + y_4.is_negative = num_points_even; + } else { + y_4.is_negative = !previous_y.is_negative; + y_4.mul_left.emplace_back(lambda2); + y_4.mul_right.emplace_back(previous_y.is_negative ? previous_x - x_4 : x_4 - previous_x); + // append terms in previous_y to y_4. We want to make sure the terms above are added into the start of y_4. + // This is to ensure they are cached correctly when + // `composer::evaluate_partial_non_native_field_multiplication` is called. 
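One subtle point in the loop above is the sign bookkeeping on `composite_y`, which is what makes the assertion at the end of the method (that `previous_y.is_negative` is false) hold: the first iteration seeds `is_negative` with `num_points_even`, and every subsequent iteration flips it. Checking both parities of n = add.size():

    n even : starts true  (num_points_even), flipped n - 1 times (odd)  -> false
    n odd  : starts false (num_points_even), flipped n - 1 times (even) -> false

So whichever number of addends is passed in, the deferred y-coordinate finally handed to `Fq::mult_madd` is stored with a positive sign.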
+ // (the 1st mul_left, mul_right elements will trigger composer::evaluate_non_native_field_multiplication + // when Fq::mult_madd is called - this term cannot be cached so we want to make sure it is unique) + std::copy(previous_y.mul_left.begin(), previous_y.mul_left.end(), std::back_inserter(y_4.mul_left)); + std::copy(previous_y.mul_right.begin(), previous_y.mul_right.end(), std::back_inserter(y_4.mul_right)); + std::copy(previous_y.add.begin(), previous_y.add.end(), std::back_inserter(y_4.add)); + } + previous_x = x_4; + previous_y = y_4; } + Fq x_out = previous_x; - add1.x.assert_is_not_equal(x_1); - - const Fq x_minus_x_1 = x - x_1; - const Fq lambda_1 = Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_1 - add1.x), { add1.y, y }); - - const Fq x_3 = lambda_1.sqradd({ -add1.x, -x_1 }); - const Fq half_minus_lambda_2_minus_lambda_1 = - Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_3 - x_1), { y }); - const Fq minus_lambda_2_minus_lambda_1 = half_minus_lambda_2_minus_lambda_1 + half_minus_lambda_2_minus_lambda_1; - const Fq minus_lambda_2 = minus_lambda_2_minus_lambda_1 + lambda_1; - - const Fq x_4 = minus_lambda_2.sqradd({ -x_1, -x_3 }); - - const Fq y_4 = Fq::dual_madd(minus_lambda_2, (x_4 - x_1), minus_lambda_dbl, x_minus_x_1, { y }); + ASSERT(!previous_y.is_negative); - return element(x_4, y_4); + Fq y_out = Fq::mult_madd(previous_y.mul_left, previous_y.mul_right, previous_y.add); + return element(x_out, y_out); } /** @@ -551,6 +601,7 @@ element element::batch_mul(const std::vector& scalars, const size_t max_num_bits) { + const size_t num_points = points.size(); ASSERT(scalars.size() == num_points); batch_lookup_table point_table(points); @@ -563,38 +614,25 @@ element element::batch_mul(const std::vector> nafs; - for (size_t j = 0; j < num_points; ++j) { - nafs.emplace_back(naf_entries[j][i * 2 - 1]); - } - element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs); - for (size_t j = 0; j < num_points; ++j) { - nafs[j] = (naf_entries[j][i * 2]); - } - element::chain_add_accumulator add_2 = point_table.get_chain_add_accumulator(nafs); - if (!add_1.is_element) { - accumulator = accumulator.double_montgomery_ladder(add_1, add_2); - } else { - accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev), - element(add_2.x3_prev, add_2.y3_prev)); + constexpr size_t num_rounds_per_iteration = 4; + size_t num_iterations = num_rounds / num_rounds_per_iteration; + num_iterations += ((num_iterations * num_rounds_per_iteration) == num_rounds) ? 0 : 1; + const size_t num_rounds_per_final_iteration = (num_rounds - 1) - ((num_iterations - 1) * num_rounds_per_iteration); + for (size_t i = 0; i < num_iterations; ++i) { + + std::vector> nafs(num_points); + std::vector to_add; + const size_t inner_num_rounds = + (i != num_iterations - 1) ? 
num_rounds_per_iteration : num_rounds_per_final_iteration; + for (size_t j = 0; j < inner_num_rounds; ++j) { + for (size_t k = 0; k < num_points; ++k) { + nafs[k] = (naf_entries[k][i * num_rounds_per_iteration + j + 1]); + } + to_add.emplace_back(point_table.get_chain_add_accumulator(nafs)); } + accumulator = accumulator.multiple_montgomery_ladder(to_add); } - if ((num_rounds & 0x01ULL) == 0x00ULL) { - std::vector> nafs; - for (size_t j = 0; j < points.size(); ++j) { - nafs.emplace_back(naf_entries[j][num_rounds - 1]); - } - element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs); - if (add_1.is_element) { - element temp(add_1.x3_prev, add_1.y3_prev); - accumulator = accumulator.montgomery_ladder(temp); - } else { - accumulator = accumulator.montgomery_ladder(add_1); - } - } - for (size_t i = 0; i < num_points; ++i) { element skew = accumulator - points[i]; Fq out_x = accumulator.x.conditional_select(skew.x, naf_entries[i][num_rounds]); diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp index 63257d568738..49e43ba6e9cd 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp @@ -243,9 +243,15 @@ typename element::secp256k1_wnaf_pair element::compu // Compute and constrain skews field_t negative_skew = witness_t(ctx, is_negative ? 0 : skew); field_t positive_skew = witness_t(ctx, is_negative ? skew : 0); - negative_skew.create_range_constraint(1); - positive_skew.create_range_constraint(1); - (negative_skew + positive_skew).create_range_constraint(1); + if constexpr (C::type == ComposerType::PLOOKUP) { + ctx->create_new_range_constraint(negative_skew.witness_index, 1, "biggroup_nafs"); + ctx->create_new_range_constraint(positive_skew.witness_index, 1, "biggroup_nafs"); + ctx->create_new_range_constraint((negative_skew + positive_skew).witness_index, 1, "biggroup_nafs"); + } else { + ctx->create_range_constraint(negative_skew.witness_index, 1, "biggroup_nafs"); + ctx->create_range_constraint(positive_skew.witness_index, 1, "biggroup_nafs"); + ctx->create_range_constraint((negative_skew + positive_skew).witness_index, 1, "biggroup_nafs"); + } const auto reconstruct_bigfield_from_wnaf = [ctx](const std::vector>& wnaf, const field_t& positive_skew, @@ -378,14 +384,21 @@ std::vector> element::compute_wnaf(const Fr& scalar) offset_entry = (1ULL << (WNAF_SIZE - 1)) - 1 - (wnaf_values[i] & 0xffffff); } field_t entry(witness_t(ctx, offset_entry)); - - entry.create_range_constraint(WNAF_SIZE); + if constexpr (C::type == ComposerType::PLOOKUP) { + ctx->create_new_range_constraint(entry.witness_index, 1ULL << (WNAF_SIZE), "biggroup_nafs"); + } else { + ctx->create_range_constraint(entry.witness_index, WNAF_SIZE, "biggroup_nafs"); + } wnaf_entries.emplace_back(entry); } // add skew wnaf_entries.emplace_back(witness_t(ctx, skew)); - wnaf_entries[wnaf_entries.size() - 1].create_range_constraint(1); + if constexpr (C::type == ComposerType::PLOOKUP) { + ctx->create_new_range_constraint(wnaf_entries[wnaf_entries.size() - 1].witness_index, 1, "biggroup_nafs"); + } else { + ctx->create_range_constraint(wnaf_entries[wnaf_entries.size() - 1].witness_index, 1, "biggroup_nafs"); + } // TODO: VALIDATE SUM DOES NOT OVERFLOW P @@ -494,15 +507,25 @@ std::vector> element::compute_naf(const Fr& scalar, cons bit.context = ctx; bit.witness_index = 
witness_t(ctx, true).witness_index; // flip sign bit.witness_bool = true; - ctx->create_range_constraint( - bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case"); + if constexpr (C::type == ComposerType::PLOOKUP) { + ctx->create_new_range_constraint( + bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case"); + } else { + ctx->create_range_constraint( + bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case"); + } naf_entries[num_rounds - i - 1] = bit; } else { bool_t bit(ctx, false); bit.witness_index = witness_t(ctx, false).witness_index; // don't flip sign bit.witness_bool = false; - ctx->create_range_constraint( - bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case"); + if constexpr (C::type == ComposerType::PLOOKUP) { + ctx->create_new_range_constraint( + bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case"); + } else { + ctx->create_range_constraint( + bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case"); + } naf_entries[num_rounds - i - 1] = bit; } } diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp index 7d510e2794a4..756f955a7a18 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp @@ -86,15 +86,18 @@ element element::secp256k1_ecdsa_mul(const element& // See `stdlib/memory/rom_table.hpp` for how indirect array accesses are implemented in UltraPlonk const auto& add_1 = endoP2_table[u2_hi_wnaf.wnaf[2 * i]]; const auto& add_2 = P2_table[u2_lo_wnaf.wnaf[2 * i + 1]]; - accumulator = accumulator.double_montgomery_ladder(add_1, add_2); - const auto& add_3 = endoP1_table[u1_hi_wnaf.wnaf[i]]; const auto& add_4 = P1_table[u1_lo_wnaf.wnaf[i]]; - accumulator = accumulator.double_montgomery_ladder(add_3, add_4); - const auto& add_5 = endoP2_table[u2_hi_wnaf.wnaf[2 * i + 1]]; const auto& add_6 = P2_table[u2_lo_wnaf.wnaf[2 * i + 2]]; - accumulator = accumulator.double_montgomery_ladder(add_5, add_6); + + accumulator = accumulator.multiple_montgomery_ladder({ element::chain_add_accumulator(add_1), + element::chain_add_accumulator(add_2), + element::chain_add_accumulator(add_3) }); + + accumulator = accumulator.multiple_montgomery_ladder({ element::chain_add_accumulator(add_4), + element::chain_add_accumulator(add_5), + element::chain_add_accumulator(add_6) }); } /** diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp index 9098799a7b1c..7f31017f6701 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp @@ -4,10 +4,27 @@ namespace proof_system::plonk { namespace stdlib { using plookup::MultiTableId; + +/** + * @brief Constructs a ROM table to look up linear combinations of group elements + * + * @tparam C + * @tparam Fq + * @tparam Fr + * @tparam G + * @tparam num_elements + * @tparam typename + * @param rom_data the ROM table we are writing into + * @param limb_max the maximum size of each limb in the ROM table. 
+ * + * @details When reading a group element *out* of the ROM table, we must know the maximum value of each coordinate's + * limbs. We take this value to be the maximum of the maximum values of the input limbs into the table! + * @return std::array, 5> + */ template template std::array, 5> element::create_group_element_rom_tables( - const std::array& rom_data) + const std::array& rom_data, std::array& limb_max) { std::vector, 2>> x_lo_limbs; std::vector, 2>> x_hi_limbs; @@ -16,6 +33,15 @@ std::array, 5> element::create_group_element_rom std::vector, 2>> prime_limbs; for (size_t i = 0; i < num_elements; ++i) { + limb_max[0] = std::max(limb_max[0], rom_data[i].x.binary_basis_limbs[0].maximum_value); + limb_max[1] = std::max(limb_max[1], rom_data[i].x.binary_basis_limbs[1].maximum_value); + limb_max[2] = std::max(limb_max[2], rom_data[i].x.binary_basis_limbs[2].maximum_value); + limb_max[3] = std::max(limb_max[3], rom_data[i].x.binary_basis_limbs[3].maximum_value); + limb_max[4] = std::max(limb_max[4], rom_data[i].y.binary_basis_limbs[0].maximum_value); + limb_max[5] = std::max(limb_max[5], rom_data[i].y.binary_basis_limbs[1].maximum_value); + limb_max[6] = std::max(limb_max[6], rom_data[i].y.binary_basis_limbs[2].maximum_value); + limb_max[7] = std::max(limb_max[7], rom_data[i].y.binary_basis_limbs[3].maximum_value); + x_lo_limbs.emplace_back(std::array, 2>{ rom_data[i].x.binary_basis_limbs[0].element, rom_data[i].x.binary_basis_limbs[1].element }); x_hi_limbs.emplace_back(std::array, 2>{ rom_data[i].x.binary_basis_limbs[2].element, @@ -39,7 +65,7 @@ std::array, 5> element::create_group_element_rom template template element element::read_group_element_rom_tables( - const std::array, 5>& tables, const field_t& index) + const std::array, 5>& tables, const field_t& index, const std::array& limb_max) { const auto xlo = tables[0][index]; const auto xhi = tables[1][index]; @@ -49,6 +75,15 @@ element element::read_group_element_rom_tables( Fq x_fq(xlo[0], xlo[1], xhi[0], xhi[1], xyprime[0]); Fq y_fq(ylo[0], ylo[1], yhi[0], yhi[1], xyprime[1]); + x_fq.binary_basis_limbs[0].maximum_value = limb_max[0]; + x_fq.binary_basis_limbs[1].maximum_value = limb_max[1]; + x_fq.binary_basis_limbs[2].maximum_value = limb_max[2]; + x_fq.binary_basis_limbs[3].maximum_value = limb_max[3]; + y_fq.binary_basis_limbs[0].maximum_value = limb_max[4]; + y_fq.binary_basis_limbs[1].maximum_value = limb_max[5]; + y_fq.binary_basis_limbs[2].maximum_value = limb_max[6]; + y_fq.binary_basis_limbs[3].maximum_value = limb_max[7]; + const auto output = element(x_fq, y_fq); return output; } @@ -64,17 +99,17 @@ element::four_bit_table_plookup::four_bit_table_plookup(const e element_table[i] = element_table[i - 1] + d2; } for (size_t i = 0; i < 8; ++i) { - element_table[i] = (-element_table[15 - i]).reduce(); + element_table[i] = (-element_table[15 - i]); } - coordinates = create_group_element_rom_tables<16>(element_table); + coordinates = create_group_element_rom_tables<16>(element_table, limb_max); } template template element element::four_bit_table_plookup::operator[](const field_t& index) const { - return read_group_element_rom_tables<16>(coordinates, index); + return read_group_element_rom_tables<16>(coordinates, index, limb_max); } template @@ -146,109 +181,134 @@ template element::lookup_table_plookup::lookup_table_plookup(const std::array& inputs) { if constexpr (length == 2) { - element_table[0] = inputs[1] + inputs[0]; - element_table[1] = inputs[1] - inputs[0]; + auto [A0, A1] = inputs[1].add_sub(inputs[0]); + 
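// Editor's aside (illustrative sketch, not part of the patch): `add_sub` above is assumed to
// return the pair {a + b, a - b}, so both results can share a single non-native field reduction.
// The same sign-pattern layout, modelled with plain integers for the three-input case
// (table entry i holds C ± B ± A, where bit 1 of i flips the sign of B and bit 0 flips A):

#include <array>
#include <cstdint>

std::array<int64_t, 4> build_signed_table(int64_t A, int64_t B, int64_t C)
{
    const auto add_sub = [](int64_t a, int64_t b) { return std::array<int64_t, 2>{ a + b, a - b }; };
    const auto [R0, R1] = add_sub(B, A);  // B + A, B - A
    const auto [T0, T1] = add_sub(C, R0); // C + B + A, C - B - A
    const auto [T2, T3] = add_sub(C, R1); // C + B - A, C - B + A
    return { T0, T2, T3, T1 };            // { C+B+A, C+B-A, C-B+A, C-B-A }
}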
element_table[0] = A0; + element_table[1] = A1; } else if constexpr (length == 3) { - element R0 = inputs[1] + inputs[0]; - element R1 = inputs[1] - inputs[0]; - element_table[0] = inputs[2] + R0; // C + B + A - element_table[1] = inputs[2] + R1; // C + B - A - element_table[2] = inputs[2] - R1; // C - B + A - element_table[3] = inputs[2] - R0; // C - B - A - } else if constexpr (length == 4) { - element T0 = inputs[1] + inputs[0]; - element T1 = inputs[1] - inputs[0]; - element T2 = inputs[3] + inputs[2]; - element T3 = inputs[3] - inputs[2]; + auto [R0, R1] = inputs[1].add_sub(inputs[0]); // B ± A - element_table[0] = T2 + T0; // D + C + B + A - element_table[1] = T2 + T1; // D + C + B - A - element_table[2] = T2 - T1; // D + C - B + A - element_table[3] = T2 - T0; // D + C - B - A - element_table[4] = T3 + T0; // D - C + B + A - element_table[5] = T3 + T1; // D - C + B - A - element_table[6] = T3 - T1; // D - C - B + A - element_table[7] = T3 - T0; // D - C - B - A - } else if constexpr (length == 5) { - element A0 = inputs[1] + inputs[0]; // B + A - element A1 = inputs[1] - inputs[0]; // B - A + auto [T0, T1] = inputs[2].add_sub(R0); // C ± (B + A) + auto [T2, T3] = inputs[2].add_sub(R1); // C ± (B - A) - element T2 = inputs[3] + inputs[2]; // D + C - element T3 = inputs[3] - inputs[2]; // D - C - - element E0 = inputs[4] + T2; // E + D + C // 0 0 0 - element E1 = inputs[4] + T3; // E + D - C // 0 0 1 - element E2 = inputs[4] - T3; // E - D + C // 0 1 0 - element E3 = inputs[4] - T2; // E - D - C // 0 1 1 - - element_table[0] = E0 + A0; // E + D + C + B + A // 0 0 0 0 0 - element_table[1] = E0 + A1; // E + D + C + B - A // 0 0 0 0 1 - element_table[2] = E0 - A1; // E + D + C - B + A // 0 0 0 1 0 - element_table[3] = E0 - A0; // E + D + C - B - A // 0 0 0 1 1 - element_table[4] = E1 + A0; // E + D - C + B + A // 0 0 1 0 0 - element_table[5] = E1 + A1; // E + D - C + B - A // 0 0 1 0 1 - element_table[6] = E1 - A1; // E + D - C - B + A // 0 0 1 1 0 - element_table[7] = E1 - A0; // E + D - C - B - A // 0 0 1 1 1 - element_table[8] = E2 + A0; // E - D + C + B + A // 0 1 0 0 0 - element_table[9] = E2 + A1; // E - D + C + B - A // 0 1 0 0 1 - element_table[10] = E2 - A1; // E - D + C - B + A // 0 1 0 1 0 - element_table[11] = E2 - A0; // E - D - C - B - A // 0 1 0 1 1 - element_table[12] = E3 + A0; // E - D - C + B + A // 0 1 1 0 0 - element_table[13] = E3 + A1; // E - D - C + B - A // 0 1 1 0 1 - element_table[14] = E3 - A1; // E - D - C - B + A // 0 1 1 1 0 - element_table[15] = E3 - A0; // E - D - C - B - A // 0 1 1 1 1 + element_table[0] = T0; + element_table[1] = T2; + element_table[2] = T3; + element_table[3] = T1; + } else if constexpr (length == 4) { + auto [T0, T1] = inputs[1].add_sub(inputs[0]); // B ± A + auto [T2, T3] = inputs[3].add_sub(inputs[2]); // D ± C + + auto [F0, F3] = T2.add_sub(T0); // (D + C) ± (B + A) + auto [F1, F2] = T2.add_sub(T1); // (D + C) ± (B - A) + auto [F4, F7] = T3.add_sub(T0); // (D - C) ± (B + A) + auto [F5, F6] = T3.add_sub(T1); // (D - C) ± (B - A) + + element_table[0] = F0; + element_table[1] = F1; + element_table[2] = F2; + element_table[3] = F3; + element_table[4] = F4; + element_table[5] = F5; + element_table[6] = F6; + element_table[7] = F7; + } else if constexpr (length == 5) { + auto [A0, A1] = inputs[1].add_sub(inputs[0]); // B ± A + auto [T2, T3] = inputs[3].add_sub(inputs[2]); // D ± C + + auto [E0, E3] = inputs[4].add_sub(T2); // E ± (D + C) + auto [E1, E2] = inputs[4].add_sub(T3); // E ± (D - C) + + auto [F0, F3] = E0.add_sub(A0); + auto 
[F1, F2] = E0.add_sub(A1); + auto [F4, F7] = E1.add_sub(A0); + auto [F5, F6] = E1.add_sub(A1); + auto [F8, F11] = E2.add_sub(A0); + auto [F9, F10] = E2.add_sub(A1); + auto [F12, F15] = E3.add_sub(A0); + auto [F13, F14] = E3.add_sub(A1); + + element_table[0] = F0; + element_table[1] = F1; + element_table[2] = F2; + element_table[3] = F3; + element_table[4] = F4; + element_table[5] = F5; + element_table[6] = F6; + element_table[7] = F7; + element_table[8] = F8; + element_table[9] = F9; + element_table[10] = F10; + element_table[11] = F11; + element_table[12] = F12; + element_table[13] = F13; + element_table[14] = F14; + element_table[15] = F15; } else if constexpr (length == 6) { // 44 adds! Only use this if it saves us adding another table to a multi-scalar-multiplication - element A0 = inputs[1] + inputs[0]; // B + A - element A1 = inputs[1] - inputs[0]; // B - A - element E0 = inputs[4] + inputs[3]; // E + D - element E1 = inputs[4] - inputs[3]; // E - D - - element C0 = inputs[2] + A0; // C + B + A - element C1 = inputs[2] + A1; // C + B - A - element C2 = inputs[2] - A1; // C - B + A - element C3 = inputs[2] - A0; // C - B - A - - element F0 = inputs[5] + E0; // F + E + D - element F1 = inputs[5] + E1; // F + E - D - element F2 = inputs[5] - E1; // F - E + D - element F3 = inputs[5] - E0; // F - E - E - - element_table[0] = F0 + C0; - element_table[1] = F0 + C1; - element_table[2] = F0 + C2; - element_table[3] = F0 + C3; - element_table[4] = F0 - C3; - element_table[5] = F0 - C2; - element_table[6] = F0 - C1; - element_table[7] = F0 - C0; - - element_table[8] = F1 + C0; - element_table[9] = F1 + C1; - element_table[10] = F1 + C2; - element_table[11] = F1 + C3; - element_table[12] = F1 - C3; - element_table[13] = F1 - C2; - element_table[14] = F1 - C1; - element_table[15] = F1 - C0; - - element_table[16] = F2 + C0; - element_table[17] = F2 + C1; - element_table[18] = F2 + C2; - element_table[19] = F2 + C3; - element_table[20] = F2 - C3; - element_table[21] = F2 - C2; - element_table[22] = F2 - C1; - element_table[23] = F2 - C0; - - element_table[24] = F3 + C0; - element_table[25] = F3 + C1; - element_table[26] = F3 + C2; - element_table[27] = F3 + C3; - element_table[28] = F3 - C3; - element_table[29] = F3 - C2; - element_table[30] = F3 - C1; - element_table[31] = F3 - C0; + + auto [A0, A1] = inputs[1].add_sub(inputs[0]); + auto [E0, E1] = inputs[4].add_sub(inputs[3]); + auto [C0, C3] = inputs[2].add_sub(A0); + auto [C1, C2] = inputs[2].add_sub(A1); + + auto [F0, F3] = inputs[5].add_sub(E0); + auto [F1, F2] = inputs[5].add_sub(E1); + + auto [R0, R7] = F0.add_sub(C0); + auto [R1, R6] = F0.add_sub(C1); + auto [R2, R5] = F0.add_sub(C2); + auto [R3, R4] = F0.add_sub(C3); + + auto [S0, S7] = F1.add_sub(C0); + auto [S1, S6] = F1.add_sub(C1); + auto [S2, S5] = F1.add_sub(C2); + auto [S3, S4] = F1.add_sub(C3); + + auto [U0, U7] = F2.add_sub(C0); + auto [U1, U6] = F2.add_sub(C1); + auto [U2, U5] = F2.add_sub(C2); + auto [U3, U4] = F2.add_sub(C3); + + auto [W0, W7] = F3.add_sub(C0); + auto [W1, W6] = F3.add_sub(C1); + auto [W2, W5] = F3.add_sub(C2); + auto [W3, W4] = F3.add_sub(C3); + + element_table[0] = R0; + element_table[1] = R1; + element_table[2] = R2; + element_table[3] = R3; + element_table[4] = R4; + element_table[5] = R5; + element_table[6] = R6; + element_table[7] = R7; + + element_table[8] = S0; + element_table[9] = S1; + element_table[10] = S2; + element_table[11] = S3; + element_table[12] = S4; + element_table[13] = S5; + element_table[14] = S6; + element_table[15] = S7; + + 
element_table[16] = U0; + element_table[17] = U1; + element_table[18] = U2; + element_table[19] = U3; + element_table[20] = U4; + element_table[21] = U5; + element_table[22] = U6; + element_table[23] = U7; + + element_table[24] = W0; + element_table[25] = W1; + element_table[26] = W2; + element_table[27] = W3; + element_table[28] = W4; + element_table[29] = W5; + element_table[30] = W6; + element_table[31] = W7; } else if constexpr (length == 7) { // 82 adds! This one is not worth using... @@ -341,9 +401,9 @@ element::lookup_table_plookup::lookup_table_plookup(con element_table[63] = G3 - E0; } for (size_t i = 0; i < table_size / 2; ++i) { - element_table[i + table_size / 2] = (-element_table[table_size / 2 - 1 - i]).reduce(); + element_table[i + table_size / 2] = (-element_table[table_size / 2 - 1 - i]); } - coordinates = create_group_element_rom_tables(element_table); + coordinates = create_group_element_rom_tables(element_table, limb_max); } template @@ -356,7 +416,7 @@ element element::lookup_table_plookup::ge accumulators.emplace_back(field_t(bits[i]) * (1ULL << i)); } field_t index = field_t::accumulate(accumulators); - return read_group_element_rom_tables(coordinates, index); + return read_group_element_rom_tables(coordinates, index, limb_max); } /** diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp index 64d7e69fc8f7..005ce5aec21a 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp @@ -28,9 +28,45 @@ template struct aggregation_state { // has_data == other.has_data; can't compare as native }; + /** + * @brief TODO(@dbanks12 please migrate A3 circuits to using `assign_object_to_proof_outputs`. Much safer to not + * independently track `proof_witness_indices` and whether object has been assigned to public inputs) + * + */ void add_proof_outputs_as_public_inputs() { - ASSERT(proof_witness_indices.size() > 0); + auto* context = P0.get_context(); + context->add_recursive_proof(proof_witness_indices); + } + + void assign_object_to_proof_outputs() + { + if (proof_witness_indices.size() == 0) { + std::cerr << "warning. 
calling `add_proof_outputs_as_public_inputs`, but aggregation object already has " + "assigned proof outputs to public inputs."; + return; + } + + P0 = P0.reduce(); + P1 = P1.reduce(); + proof_witness_indices = { + P0.x.binary_basis_limbs[0].element.normalize().witness_index, + P0.x.binary_basis_limbs[1].element.normalize().witness_index, + P0.x.binary_basis_limbs[2].element.normalize().witness_index, + P0.x.binary_basis_limbs[3].element.normalize().witness_index, + P0.y.binary_basis_limbs[0].element.normalize().witness_index, + P0.y.binary_basis_limbs[1].element.normalize().witness_index, + P0.y.binary_basis_limbs[2].element.normalize().witness_index, + P0.y.binary_basis_limbs[3].element.normalize().witness_index, + P1.x.binary_basis_limbs[0].element.normalize().witness_index, + P1.x.binary_basis_limbs[1].element.normalize().witness_index, + P1.x.binary_basis_limbs[2].element.normalize().witness_index, + P1.x.binary_basis_limbs[3].element.normalize().witness_index, + P1.y.binary_basis_limbs[0].element.normalize().witness_index, + P1.y.binary_basis_limbs[1].element.normalize().witness_index, + P1.y.binary_basis_limbs[2].element.normalize().witness_index, + P1.y.binary_basis_limbs[3].element.normalize().witness_index, + }; auto* context = P0.get_context(); diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp index 5ca0439d9c04..9db2f12dc9e2 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp @@ -12,10 +12,11 @@ #include "../../commitment/pedersen/pedersen_plookup.hpp" #include "../../primitives/bigfield/bigfield.hpp" #include "../../primitives/biggroup/biggroup.hpp" -#include "../../primitives/bool/bool.hpp" #include "../../primitives/field/field.hpp" #include "../../primitives/witness/witness.hpp" +#include "../../primitives/bool/bool.hpp" +#include "../verification_key//verification_key.hpp" namespace proof_system::plonk { namespace stdlib { namespace recursion { @@ -133,196 +134,140 @@ template class Transcript { ++current_round; return; } - const size_t bytes_per_element = 31; - - // split element into 2 limbs and insert into element_buffer - // each entry in element_buffer is 31 bytes - const auto split = [&](field_pt& work_element, - std::vector& element_buffer, - const field_pt& element, - size_t& current_byte_counter, - const size_t num_bytes) { - uint256_t element_u256(element.get_value()); - size_t hi_bytes = bytes_per_element - current_byte_counter; - if (hi_bytes >= num_bytes) { - // hmm - size_t new_byte_counter = current_byte_counter + num_bytes; - field_pt hi = element; - const size_t leftovers = bytes_per_element - new_byte_counter; - field_pt buffer_shift = - field_pt(context, barretenberg::fr(uint256_t(1) << ((uint64_t)leftovers * 8ULL))); - work_element = work_element + (hi * buffer_shift); - work_element = work_element.normalize(); - current_byte_counter = new_byte_counter; - if (current_byte_counter == bytes_per_element) { - current_byte_counter = 0; - element_buffer.push_back(work_element); - work_element = field_pt(context, barretenberg::fr(0)); - } - return; - } - const size_t lo_bytes = num_bytes - hi_bytes; - field_pt lo = witness_t(context, barretenberg::fr(element_u256.slice(0, lo_bytes * 8))); - field_pt hi = witness_t(context, barretenberg::fr(element_u256.slice(lo_bytes * 8, 256))); - lo.create_range_constraint(lo_bytes * 8); - 
hi.create_range_constraint(hi_bytes * 8); - field_pt shift(context, barretenberg::fr(uint256_t(1ULL) << (uint64_t)lo_bytes * 8ULL)); - field_pt sum = lo + (hi * shift); - if (!element.is_constant() || !sum.is_constant()) { - sum.assert_equal(element); - } - current_byte_counter = (current_byte_counter + num_bytes) % bytes_per_element; - - // if current_byte_counter == 0 we've rolled over - if (current_byte_counter == 0) { - element_buffer.push_back(work_element + hi); - element_buffer.push_back(lo); - work_element = field_pt(context, 0); - } else { - work_element = work_element + hi; - - element_buffer.push_back(work_element); - - field_t lo_shift( - context, barretenberg::fr(uint256_t(1ULL) << ((31ULL - (uint64_t)current_byte_counter) * 8ULL))); - work_element = (lo * lo_shift); - work_element = work_element.normalize(); - } - }; - - std::vector compression_buffer; field_pt working_element(context); - size_t byte_counter = 0; + // maximum number of bytes we can store in a field element w/o wrapping modulus is 31. + // while we could store more *bits*, we want `preimage_buffer` to mirror how data is formatted + // when we serialize field/group elements natively (i.e. a byte array) + static constexpr size_t NUM_BITS_PER_PREIMAGE_ELEMENT = 31UL * 8UL; + PedersenPreimageBuilder preimage_buffer(context); if (current_round > 0) { - split(working_element, compression_buffer, field_pt(current_challenge), byte_counter, 32); + preimage_buffer.add_element(current_challenge); } for (auto manifest_element : get_manifest().get_round_manifest(current_round).elements) { if (manifest_element.num_bytes == 32 && manifest_element.name != "public_inputs") { - split(working_element, - compression_buffer, - get_field_element(manifest_element.name), - byte_counter, - manifest_element.num_bytes); + preimage_buffer.add_element(get_field_element(manifest_element.name)); } else if (manifest_element.num_bytes == 64 && manifest_element.name != "public_inputs") { group_pt point = get_circuit_group_element(manifest_element.name); - field_pt y_hi = - point.y.binary_basis_limbs[2].element + (point.y.binary_basis_limbs[3].element * fq_pt::shift_1); - field_pt y_lo = - point.y.binary_basis_limbs[0].element + (point.y.binary_basis_limbs[1].element * fq_pt::shift_1); - field_pt x_hi = - point.x.binary_basis_limbs[2].element + (point.x.binary_basis_limbs[3].element * fq_pt::shift_1); - field_pt x_lo = - point.x.binary_basis_limbs[0].element + (point.x.binary_basis_limbs[1].element * fq_pt::shift_1); - const size_t lo_bytes = fq_pt::NUM_LIMB_BITS / 4; - const size_t hi_bytes = 32 - lo_bytes; - - split(working_element, compression_buffer, y_hi, byte_counter, hi_bytes); - split(working_element, compression_buffer, y_lo, byte_counter, lo_bytes); - split(working_element, compression_buffer, x_hi, byte_counter, hi_bytes); - split(working_element, compression_buffer, x_lo, byte_counter, lo_bytes); + // In our buffer, we want to represent each field element as occupying 256 bits of data (to match what + // the native transcript does) + const auto& x = point.x; + const auto& y = point.y; + constexpr size_t last_limb_bits = 256 - (fq_pt::NUM_LIMB_BITS * 3); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[3].element, + last_limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[2].element, + fq_pt::NUM_LIMB_BITS); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[1].element, + fq_pt::NUM_LIMB_BITS); + 
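// Editor's aside (illustrative sketch, not part of the patch): each coordinate is fed in as four
// bit-chunks of width 52 + 68 + 68 + 68 = 256 (assuming fq_pt::NUM_LIMB_BITS == 68), so the buffer
// mirrors the 32-byte-per-coordinate native serialization. The preimage builder then packs chunks
// into 248-bit (31-byte) field elements, splitting any chunk that crosses a 248-bit boundary into
// hi/lo pieces. A native model of just the counter/rollover bookkeeping:

#include <cstddef>
#include <vector>

// Returns how many full 248-bit preimage elements a sequence of chunk widths produces.
size_t count_preimage_elements(const std::vector<size_t>& chunk_bits)
{
    constexpr size_t bits_per_element = 31 * 8; // 248
    size_t counter = 0;                         // bits accumulated in the current (partial) element
    size_t elements = 0;                        // completed preimage elements
    for (const size_t bits : chunk_bits) {
        if (bits <= bits_per_element - counter) {
            counter += bits; // chunk fits entirely in the current element
            if (counter == bits_per_element) { ++elements; counter = 0; }
        } else {
            ++elements;                                    // the hi piece tops up the current element
            counter = (counter + bits) % bits_per_element; // the lo piece spills into the next one
            if (counter == 0) { ++elements; }              // lo piece happened to fill an element exactly
        }
    }
    return elements;
}
// e.g. the eight chunks of one group point (y then x) sum to 512 bits, giving two full 248-bit
// elements with 16 bits left pending in the builder under these assumptions.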
preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[0].element, + fq_pt::NUM_LIMB_BITS); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[3].element, + last_limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[2].element, + fq_pt::NUM_LIMB_BITS); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[1].element, + fq_pt::NUM_LIMB_BITS); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[0].element, + fq_pt::NUM_LIMB_BITS); + } else if (manifest_element.name == "public_inputs") { std::vector field_array = get_field_element_vector(manifest_element.name); for (size_t i = 0; i < field_array.size(); ++i) { - split(working_element, compression_buffer, field_array[i], byte_counter, 32); + preimage_buffer.add_element(field_array[i]); } } else if (manifest_element.num_bytes < 32 && manifest_element.name != "public_inputs") { - split(working_element, - compression_buffer, - get_field_element(manifest_element.name), - byte_counter, - manifest_element.num_bytes); + // TODO(zac): init round data is being grabbed out of the manifest and not the vkey + preimage_buffer.add_element_with_existing_range_constraint(get_field_element(manifest_element.name), + manifest_element.num_bytes * 8); } } - std::vector> round_challenges; + std::vector round_challenges_new; - if (byte_counter != 0) { - const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8); - working_element = working_element / barretenberg::fr(down_shift); - working_element = working_element.normalize(); + field_pt T0; + T0 = preimage_buffer.compress(0); - compression_buffer.push_back(working_element); - } + // helper method to slice a challenge into 128-bit slices + const auto slice_into_halves = [&](const field_pt& in, const size_t low_bits = 128) { + uint256_t v = in.get_value(); + uint256_t lo = v.slice(0, low_bits); + uint256_t hi = v.slice(low_bits, 256); - field_pt T0; - if constexpr (Composer::type == ComposerType::PLOOKUP) { - T0 = stdlib::pedersen_plookup_commitment::compress(compression_buffer); - } else { - T0 = stdlib::pedersen_commitment::compress(compression_buffer); - } - byte_array compressed_buffer(T0); + field_pt y_lo = field_pt::from_witness(context, lo); + field_pt y_hi = field_pt::from_witness(context, hi); - // TODO(@zac-williamson) make this a Poseidon hash - byte_array base_hash; - if constexpr (Composer::type == ComposerType::PLOOKUP) { - std::vector compression_buffer; - field_pt working_element(context); - size_t byte_counter = 0; - split(working_element, compression_buffer, field_pt(compressed_buffer), byte_counter, 32); - if (byte_counter != 0) { - const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8); - working_element = working_element / barretenberg::fr(down_shift); - working_element = working_element.normalize(); - compression_buffer.push_back(working_element); + y_lo.create_range_constraint(low_bits); + y_hi.create_range_constraint(254 - low_bits); + + in.add_two(-y_lo, -y_hi * (uint256_t(1) << low_bits)).assert_equal(0); + + // Validate the sum of our two halves does not exceed the circuit modulus over the integers + constexpr uint256_t modulus = fr::modulus; + const field_pt r_lo = field_pt(context, modulus.slice(0, low_bits)); + const field_pt r_hi = field_pt(context, modulus.slice(low_bits, 256)); + + bool need_borrow = (uint256_t(y_lo.get_value()) > 
uint256_t(r_lo.get_value())); + field_pt borrow = field_pt::from_witness(context, need_borrow); + + // directly call `create_new_range_constraint` to avoid creating an arithmetic gate + if constexpr (Composer::type == ComposerType::PLOOKUP) { + context->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow"); + } else { + context->create_range_constraint(borrow.get_witness_index(), 1, "borrow"); } - base_hash = stdlib::pedersen_plookup_commitment::compress(compression_buffer); + + // Hi range check = r_hi - y_hi - borrow + // Lo range check = r_lo - y_lo + borrow * 2^{126} + field_pt res_hi = (r_hi - y_hi) - borrow; + field_pt res_lo = (r_lo - y_lo) + (borrow * (uint256_t(1) << low_bits)); + + res_hi.create_range_constraint(modulus.get_msb() + 1 - low_bits); + res_lo.create_range_constraint(low_bits); + + return std::array{ y_lo, y_hi }; + }; + + field_pt base_hash; + if constexpr (Composer::type == ComposerType::PLOOKUP) { + base_hash = stdlib::pedersen_plookup_commitment::compress(std::vector{ T0 }, 0); } else { - base_hash = stdlib::blake3s(compressed_buffer); + base_hash = stdlib::pedersen_commitment::compress(std::vector{ T0 }, 0); } - byte_array first(field_pt(0), 16); - first.write(base_hash.slice(0, 16)); - round_challenges.push_back(first); + auto hash_halves = slice_into_halves(base_hash); + round_challenges_new.push_back(hash_halves[1]); if (num_challenges > 1) { - byte_array second(field_pt(0), 16); - second.write(base_hash.slice(16, 16)); - round_challenges.push_back(second); + round_challenges_new.push_back(hash_halves[0]); } + base_hash = (slice_into_halves(base_hash, 8)[1] * 256).normalize(); - // This block of code only executes for num_challenges > 2, which (currently) only happens in the nu round when - // we need to generate short scalars. In this case, we generate 32-byte challenges and split them in half to get - // the relevant challenges. + // This block of code only executes for num_challenges > 2, which (currently) only happens in the nu round + // when we need to generate short scalars. In this case, we generate 32-byte challenges and split them in + // half to get the relevant challenges. 
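// Editor's aside (illustrative sketch, not part of the patch): the borrow/range-check pattern above
// shows, over the integers, that y_hi * 2^128 + y_lo does not exceed the circuit modulus, so the
// prover cannot substitute the slices of `value + r`. The same argument with toy 32-bit limbs in
// native code (limb widths and names here are arbitrary):

#include <cstdint>

// Returns true iff the two "range checks" on the residuals pass; passing implies
// (hi << 32) + lo <= (r_hi << 32) + r_lo over the integers.
bool slices_do_not_exceed_modulus(uint32_t lo, uint32_t hi, uint32_t r_lo, uint32_t r_hi)
{
    const bool borrow = lo > r_lo; // the honest prover's choice of the borrow bit
    // residuals mirroring res_lo / res_hi above; signed so that any underflow is visible
    const int64_t res_lo = int64_t(r_lo) - int64_t(lo) + (borrow ? (int64_t(1) << 32) : 0);
    const int64_t res_hi = int64_t(r_hi) - int64_t(hi) - (borrow ? 1 : 0);
    // res_lo must fit in the low-limb width, res_hi must be non-negative
    return res_lo >= 0 && res_lo < (int64_t(1) << 32) && res_hi >= 0;
}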
for (size_t i = 2; i < num_challenges; i += 2) { - byte_array rolling_buffer = base_hash; - byte_array hash_output; + // TODO(@zac-williamson) make this a Poseidon hash not a Pedersen hash + field_pt hash_output; if constexpr (Composer::type == ComposerType::PLOOKUP) { - // TODO(@zac-williamson) make this a Poseidon hash not a Pedersen hash - std::vector compression_buffer; - field_pt working_element(context); - size_t byte_counter = 0; - split(working_element, compression_buffer, field_pt(rolling_buffer), byte_counter, 32); - split(working_element, compression_buffer, field_pt(field_pt(i / 2)), byte_counter, 1); - if (byte_counter != 0) { - const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8); - working_element = working_element / barretenberg::fr(down_shift); - working_element = working_element.normalize(); - compression_buffer.push_back(working_element); - } - hash_output = stdlib::pedersen_plookup_commitment::compress(compression_buffer); + hash_output = stdlib::pedersen_plookup_commitment::compress( + std::vector{ (base_hash + field_pt(i / 2)).normalize() }, 0); } else { - rolling_buffer.write(byte_array(field_pt(i / 2), 1)); - hash_output = stdlib::blake3s(rolling_buffer); + hash_output = stdlib::pedersen_commitment::compress( + std::vector{ (base_hash + field_pt(i / 2)).normalize() }, 0); } - byte_array hi(field_pt(0), 16); - hi.write(hash_output.slice(0, 16)); - round_challenges.push_back(hi); - + auto hash_halves = slice_into_halves(hash_output); + round_challenges_new.push_back(hash_halves[1]); if (i + 1 < num_challenges) { - byte_array lo(field_pt(0), 16); - lo.write(hash_output.slice(16, 16)); - round_challenges.push_back(lo); + round_challenges_new.push_back(hash_halves[0]); } } - - current_challenge = round_challenges[round_challenges.size() - 1]; + current_challenge = round_challenges_new[round_challenges_new.size() - 1]; ++current_round; - challenge_keys.push_back(challenge_name); std::vector challenge_elements; - for (const auto& challenge : round_challenges) { - challenge_elements.push_back(static_cast(challenge)); + for (const auto& challenge : round_challenges_new) { + challenge_elements.push_back(challenge); } challenge_values.push_back(challenge_elements); } @@ -420,7 +365,7 @@ template class Transcript { private: transcript::Transcript transcript_base; - byte_array current_challenge; + field_pt current_challenge; mutable std::vector field_vector_keys; mutable std::vector> field_vector_values; diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp index 57b8a39207c5..915f63e5a32f 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp @@ -20,10 +20,163 @@ #include "../../commitment/pedersen/pedersen_plookup.hpp" #include "../../primitives/curves/bn254.hpp" +#include "barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp" + namespace proof_system::plonk { namespace stdlib { namespace recursion { +/** + * @brief Constructs a packed buffer of field elements to be fed into a Pedersen compress function + * Goal is to concatenate multiple inputs together into a single field element if the inputs are known to be + * small. Produces a vector of field elements where the maximum number of bits per element is `bits_per_element`. 
+ * + * @details When calling `pedersen::compress` on the final buffer, we can skip the range checks normally performed in + * the compress method, because we know the sums of the scalar slices cannot exceed the field modulus. This requires + * `bits_per_element < modulus bits` + * @tparam Composer + * @tparam bits_per_element + */ +template struct PedersenPreimageBuilder { + using field_pt = field_t; + using witness_pt = witness_t; + + Composer* context; + + PedersenPreimageBuilder(Composer* ctx = nullptr) + : context(ctx){}; + + field_pt compress(const size_t hash_index) + { + // we can only use relaxed range checks in pedersen::compress iff bits_per_element < modulus bits + static_assert(bits_per_element < uint256_t(barretenberg::fr::modulus).get_msb()); + + if (current_bit_counter != 0) { + const uint256_t down_shift = uint256_t(1) << uint256_t((bits_per_element - current_bit_counter)); + for (auto& x : work_element) { + x = x / barretenberg::fr(down_shift); + } + preimage_data.push_back(field_pt::accumulate(work_element)); + } + if constexpr (Composer::type == ComposerType::PLOOKUP) { + return pedersen_plookup_commitment::compress_with_relaxed_range_constraints(preimage_data, + hash_index); + } else { + return pedersen_commitment::compress(preimage_data, hash_index); + } + } + + /** + * @brief preimage_data is a bit-array where `bits_per_element` number of bits are packed into a single field + * element + */ + std::vector preimage_data; + + /** + * @brief work_element represents the leading element to be added into `preimage_data`. + * Vector is composed of field elements that represent bit chunks of a known length, + * such that the sum of the bit chunks < bits_per_element + */ + std::vector work_element; + + size_t current_bit_counter = 0; + + void add_element(const field_pt& element) { slice_element(element, 256); } + + void add_element_with_existing_range_constraint(const field_pt& element, const size_t num_bits) + { + slice_element(element, num_bits); + } + + /** + * @brief Populate `preimage_data` with element whose size is known to be `num_bits`. + * `preimage_data` is treated as a bit-array where `bits_per_element` number of bits are packed into a single field + * element. `slice_element` will: + * + * 1. determine how many bits are remaining in work_element + * 2. if remaining bits > num_bits, slice `element` into 2 chunks hi/lo + * 3. fill work_element with `hi` chunk (or the full element if possible) + * 4. (if work_element is full) combine work_element chunks into a field element and push onto `preimage_data` + * 4. 
(if required) create a new work_element and populate with `lo` + * + * @param element + * @param num_bits + */ + void slice_element(const field_pt& element, const size_t num_bits) + { + ASSERT(context != nullptr); + uint256_t element_u256(element.get_value()); + size_t hi_bits = bits_per_element - current_bit_counter; + if (hi_bits >= num_bits) { + // hmm + size_t new_bit_counter = current_bit_counter + num_bits; + field_pt hi = element; + const size_t leftovers = bits_per_element - new_bit_counter; + field_pt buffer_shift = field_pt(context, barretenberg::fr(uint256_t(1) << ((uint64_t)leftovers))); + work_element.emplace_back(hi * buffer_shift); + current_bit_counter = new_bit_counter; + if (current_bit_counter == bits_per_element) { + current_bit_counter = 0; + preimage_data.push_back(field_pt::accumulate(work_element)); + + work_element = std::vector(); + } + return; + } + const size_t lo_bits = num_bits - hi_bits; + field_pt lo = witness_t(context, barretenberg::fr(element_u256.slice(0, lo_bits))); + field_pt hi = witness_t(context, barretenberg::fr(element_u256.slice(lo_bits, 256))); + lo.create_range_constraint(lo_bits); + hi.create_range_constraint(hi_bits); + field_pt shift(context, barretenberg::fr(uint256_t(1ULL) << (uint64_t)lo_bits)); + if (!element.is_constant() || !lo.is_constant() || !hi.is_constant()) { + lo.add_two(hi * shift, -element).assert_equal(0); + } + + constexpr uint256_t modulus = barretenberg::fr::modulus; + constexpr size_t modulus_bits = modulus.get_msb(); + + // If our input is a full field element we must validate the sum of our slices is < p + if (num_bits >= modulus_bits) { + const field_pt r_lo = field_pt(context, modulus.slice(0, lo_bits)); + const field_pt r_hi = field_pt(context, modulus.slice(lo_bits, num_bits)); + + bool need_borrow = (uint256_t(lo.get_value()) > uint256_t(r_lo.get_value())); + field_pt borrow = field_pt::from_witness(context, need_borrow); + + // directly call `create_new_range_constraint` to avoid creating an arithmetic gate + if constexpr (Composer::type == ComposerType::PLOOKUP) { + context->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow"); + } else { + context->create_range_constraint(borrow.get_witness_index(), 1, "borrow"); + } + // Hi range check = r_hi - y_hi - borrow + // Lo range check = r_lo - y_lo + borrow * 2^{126} + field_t res_hi = (r_hi - hi) - borrow; + field_t res_lo = (r_lo - lo) + (borrow * (uint256_t(1) << lo_bits)); + + res_hi.create_range_constraint(modulus_bits + 1 - lo_bits); + res_lo.create_range_constraint(lo_bits); + } + current_bit_counter = (current_bit_counter + num_bits) % bits_per_element; + + // if current_bit_counter == 0 we've rolled over + if (current_bit_counter == 0) { + work_element.emplace_back(hi); + preimage_data.push_back(field_pt::accumulate(work_element)); + preimage_data.push_back(lo); + work_element = std::vector(); + } else { + work_element.emplace_back(hi); + preimage_data.push_back(field_pt::accumulate(work_element)); + field_t lo_shift(context, + barretenberg::fr(uint256_t(1ULL) << ((bits_per_element - (uint64_t)current_bit_counter)))); + work_element = std::vector(); + work_element.emplace_back(lo * lo_shift); + } + }; +}; + template struct evaluation_domain { static evaluation_domain from_witness(Composer* ctx, const barretenberg::evaluation_domain& input) { @@ -51,44 +204,6 @@ template struct evaluation_domain { return domain; } - field_t compress() const - { - if constexpr (Composer::type == ComposerType::PLOOKUP) { - field_t out = 
pedersen_plookup_commitment::compress({ - root, - domain, - generator, - }); - return out; - } else { - field_t out = pedersen_commitment::compress({ - root, - domain, - generator, - }); - return out; - } - } - - static barretenberg::fr compress_native(const barretenberg::evaluation_domain& input) - { - barretenberg::fr out; - if constexpr (Composer::type == ComposerType::PLOOKUP) { - out = crypto::pedersen_commitment::lookup::compress_native({ - input.root, - input.domain, - input.generator, - }); - } else { - out = crypto::pedersen_commitment::compress_native({ - input.root, - input.domain, - input.generator, - }); - } - return out; - } - field_t root; field_t root_inverse; field_t domain; @@ -120,9 +235,15 @@ template struct verification_key { key->num_public_inputs = witness_t(ctx, input_key->num_public_inputs); key->domain = evaluation_domain::from_witness(ctx, input_key->domain); key->contains_recursive_proof = witness_t(ctx, input_key->contains_recursive_proof); - for (const auto& [tag, value] : input_key->commitments) { - key->commitments.insert({ tag, Curve::g1_ct::from_witness(ctx, value) }); + // We do not perform on_curve() circuit checks when constructing the Curve::g1_ct element. + // The assumption is that the circuit creator is honest and that the verification key hash (or some other + // method) will be used to ensure the provided key matches the key produced by the circuit creator. + // If the circuit creator is not honest, the entire set of circuit constraints being proved over cannot be + // trusted! + const typename Curve::fq_ct x = Curve::fq_ct::from_witness(ctx, value.x); + const typename Curve::fq_ct y = Curve::fq_ct::from_witness(ctx, value.y); + key->commitments.insert({ tag, typename Curve::g1_ct(x, y) }); } return key; @@ -189,71 +310,65 @@ template struct verification_key { public: field_t compress(size_t const hash_index = 0) { - field_t compressed_domain = domain.compress(); - - std::vector> preimage_data; - preimage_data.push_back(Composer::type); - preimage_data.push_back(compressed_domain); - preimage_data.push_back(num_public_inputs); + PedersenPreimageBuilder preimage_buffer(context); + + field_t composer_type = witness_t::create_constant_witness(context, Composer::type); + domain.generator.create_range_constraint(16, "domain.generator"); + domain.domain.create_range_constraint(32, "domain.generator"); + num_public_inputs.create_range_constraint(32, "num_public_inputs"); + preimage_buffer.add_element_with_existing_range_constraint(composer_type, 8); + preimage_buffer.add_element_with_existing_range_constraint(domain.generator, 16); // coset generator is small + preimage_buffer.add_element_with_existing_range_constraint(domain.domain, 32); + preimage_buffer.add_element_with_existing_range_constraint(num_public_inputs, 32); + constexpr size_t limb_bits = Curve::fq_ct::NUM_LIMB_BITS; + constexpr size_t last_limb_bits = 256 - (limb_bits * 3); for (const auto& [tag, selector] : commitments) { - preimage_data.push_back(selector.x.binary_basis_limbs[0].element); - preimage_data.push_back(selector.x.binary_basis_limbs[1].element); - preimage_data.push_back(selector.x.binary_basis_limbs[2].element); - preimage_data.push_back(selector.x.binary_basis_limbs[3].element); - preimage_data.push_back(selector.y.binary_basis_limbs[0].element); - preimage_data.push_back(selector.y.binary_basis_limbs[1].element); - preimage_data.push_back(selector.y.binary_basis_limbs[2].element); - preimage_data.push_back(selector.y.binary_basis_limbs[3].element); - } - - field_t 
compressed_key; - if constexpr (Composer::type == ComposerType::PLOOKUP) { - compressed_key = pedersen_plookup_commitment::compress(preimage_data, hash_index); - } else { - compressed_key = pedersen_commitment::compress(preimage_data, hash_index); + const auto& x = selector.x; + const auto& y = selector.y; + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[3].element, last_limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[2].element, limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[1].element, limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[0].element, limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[3].element, last_limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[2].element, limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[1].element, limb_bits); + preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[0].element, limb_bits); } + preimage_buffer.add_element(domain.root); + field_t compressed_key = preimage_buffer.compress(hash_index); return compressed_key; } - static barretenberg::fr compress_native(const std::shared_ptr& key, - const size_t hash_index = 0) + static barretenberg::fr compress_native(const std::shared_ptr& key, const size_t = 0) { - barretenberg::fr compressed_domain = evaluation_domain::compress_native(key->domain); - - constexpr size_t num_limb_bits = bn254::fq_ct::NUM_LIMB_BITS; - const auto split_bigfield_limbs = [](const uint256_t& element) { - std::vector limbs; - limbs.push_back(element.slice(0, num_limb_bits)); - limbs.push_back(element.slice(num_limb_bits, num_limb_bits * 2)); - limbs.push_back(element.slice(num_limb_bits * 2, num_limb_bits * 3)); - limbs.push_back(element.slice(num_limb_bits * 3, num_limb_bits * 4)); - return limbs; - }; - - std::vector preimage_data; - preimage_data.push_back(Composer::type); - preimage_data.push_back(compressed_domain); - preimage_data.push_back(key->num_public_inputs); + std::vector preimage_data; + + preimage_data.push_back(static_cast(Composer::type)); + + const uint256_t domain = key->domain.domain; + const uint256_t generator = key->domain.generator; + const uint256_t num_public_inputs = key->num_public_inputs; + + ASSERT(domain < (uint256_t(1) << 32)); + ASSERT(generator < (uint256_t(1) << 16)); + ASSERT(num_public_inputs < (uint256_t(1) << 32)); + + write(preimage_data, static_cast(uint256_t(key->domain.generator))); + write(preimage_data, static_cast(uint256_t(key->domain.domain))); + write(preimage_data, static_cast(key->num_public_inputs)); for (const auto& [tag, selector] : key->commitments) { - const auto x_limbs = split_bigfield_limbs(selector.x); - const auto y_limbs = split_bigfield_limbs(selector.y); - - preimage_data.push_back(x_limbs[0]); - preimage_data.push_back(x_limbs[1]); - preimage_data.push_back(x_limbs[2]); - preimage_data.push_back(x_limbs[3]); - - preimage_data.push_back(y_limbs[0]); - preimage_data.push_back(y_limbs[1]); - preimage_data.push_back(y_limbs[2]); - preimage_data.push_back(y_limbs[3]); + write(preimage_data, selector.y); + write(preimage_data, selector.x); } + write(preimage_data, key->domain.root); + barretenberg::fr compressed_key; if constexpr (Composer::type == ComposerType::PLOOKUP) { - compressed_key = 
crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index); + compressed_key = + from_buffer(crypto::pedersen_commitment::lookup::compress_native(preimage_data)); } else { - compressed_key = crypto::pedersen_commitment::compress_native(preimage_data, hash_index); + compressed_key = crypto::pedersen_commitment::compress_native(preimage_data); } return compressed_key; } diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp index aa185e34d7c4..009e732b209c 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp @@ -37,7 +37,6 @@ void populate_kate_element_map(typename Curve::Composer* ctx, typename Curve::fr_ct& batch_opening_scalar) { using fr_ct = typename Curve::fr_ct; - using g1_ct = typename Curve::g1_ct; const auto& polynomial_manifest = key->polynomial_manifest; for (size_t i = 0; i < key->polynomial_manifest.size(); ++i) { const auto& item = polynomial_manifest[i]; @@ -45,14 +44,14 @@ void populate_kate_element_map(typename Curve::Composer* ctx, const std::string poly_label(item.polynomial_label); switch (item.source) { case PolynomialSource::WITNESS: { - const auto element = transcript.get_group_element(label); - ASSERT(element.on_curve()); - if (element.is_point_at_infinity()) { + // get_circuit_group_element validates that the point produced lies on the curve + const auto element = transcript.get_circuit_group_element(label); + ASSERT(element.get_value().on_curve()); + if (element.get_value().is_point_at_infinity()) { std::cerr << label << " witness is point at infinity! Error!" << std::endl; ctx->failure("witness " + label + " is point at infinity"); } - // g1_ct::from_witness validates that the point produced lies on the curve - kate_g1_elements.insert({ label, g1_ct::from_witness(ctx, element) }); + kate_g1_elements.insert({ label, element }); break; } case PolynomialSource::SELECTOR: @@ -89,15 +88,15 @@ void populate_kate_element_map(typename Curve::Composer* ctx, fr_ct z_power = 1; for (size_t i = 0; i < program_settings::program_width; ++i) { std::string quotient_label = "T_" + std::to_string(i + 1); - const auto element = transcript.get_group_element(quotient_label); + const auto element = transcript.get_circuit_group_element(quotient_label); - kate_g1_elements.insert({ quotient_label, g1_ct::from_witness(ctx, element) }); + kate_g1_elements.insert({ quotient_label, element }); kate_fr_elements_at_zeta_large.insert({ quotient_label, quotient_nu * z_power }); z_power *= key->z_pow_n; } - const auto PI_Z = transcript.get_group_element("PI_Z"); - const auto PI_Z_OMEGA = transcript.get_group_element("PI_Z_OMEGA"); + const auto PI_Z = transcript.get_circuit_group_element("PI_Z"); + const auto PI_Z_OMEGA = transcript.get_circuit_group_element("PI_Z_OMEGA"); fr_ct u = transcript.get_challenge_field_element("separator", 0); @@ -105,10 +104,10 @@ void populate_kate_element_map(typename Curve::Composer* ctx, proof_system::plonk::compute_kate_batch_evaluation(key, transcript); batch_opening_scalar = -batch_evaluation; - kate_g1_elements.insert({ "PI_Z_OMEGA", g1_ct::from_witness(ctx, PI_Z_OMEGA) }); + kate_g1_elements.insert({ "PI_Z_OMEGA", PI_Z_OMEGA }); kate_fr_elements_at_zeta_large.insert({ "PI_Z_OMEGA", zeta * key->domain.root * u }); - kate_g1_elements.insert({ "PI_Z", g1_ct::from_witness(ctx, PI_Z) }); + kate_g1_elements.insert({ "PI_Z", PI_Z }); 
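// Editor's aside (illustrative sketch, not part of the patch, and assuming each field/group
// coordinate serializes to 32 bytes as the `write` calls in compress_native suggest): the packed
// native preimage for a key with N commitments is laid out as
//   1 byte   composer type
//   2 bytes  coset generator        (asserted < 2^16)
//   4 bytes  domain size            (asserted < 2^32)
//   4 bytes  num_public_inputs      (asserted < 2^32)
//  64 bytes  per commitment (y then x)
//  32 bytes  domain root
// so its length can be sanity-checked as:

#include <cstddef>

constexpr size_t verification_key_preimage_bytes(size_t num_commitments)
{
    return 1 + 2 + 4 + 4 + num_commitments * 64 + 32;
}
// e.g. a key with 16 commitments would produce a 1067-byte preimage under these assumptions.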
kate_fr_elements_at_zeta.insert({ "PI_Z", zeta }); } @@ -287,18 +286,6 @@ aggregation_state verify_proof(typename Curve::Composer* context, for (const auto& [label, fr_value] : kate_fr_elements_at_zeta_omega) { const auto& g1_value = kate_g1_elements[label]; - // if (fr_value.get_value() == 0 && fr_value.witness_index != IS_CONSTANT ) - // { - // std::cerr << "bad scalar zero at " << label << std::endl; - // } - // if (fr_value.get_value() == 0 && fr_value.witness_index == IS_CONSTANT) { - // std::cerr << "scalar zero at " << label << std::endl; - // continue; - // } - - // if (fr_value.get_value() == 0 && fr_value.witness_index == IS_CONSTANT) { - // continue; - // } double_opening_scalars.emplace_back(fr_value); double_opening_elements.emplace_back(g1_value); } @@ -320,8 +307,7 @@ aggregation_state verify_proof(typename Curve::Composer* context, opening_elements.push_back(previous_output.P0); opening_scalars.push_back(random_separator); - rhs_elements.push_back( - (-(previous_output.P1)).reduce()); // TODO: use .normalize() instead? (As per defi bridge project) + rhs_elements.push_back((-(previous_output.P1))); rhs_scalars.push_back(random_separator); } @@ -344,6 +330,10 @@ aggregation_state verify_proof(typename Curve::Composer* context, const fr_ct l1 = public_inputs[idx1]; const fr_ct l2 = public_inputs[idx2]; const fr_ct l3 = public_inputs[idx3]; + l0.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l0"); + l1.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l1"); + l2.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l2"); + l3.create_range_constraint(fq_ct::NUM_LAST_LIMB_BITS, "l3"); return fq_ct(l0, l1, l2, l3, false); }; @@ -369,7 +359,7 @@ aggregation_state verify_proof(typename Curve::Composer* context, opening_elements.push_back(g1_ct(x0, y0)); opening_scalars.push_back(recursion_separator_challenge); - rhs_elements.push_back((-g1_ct(x1, y1)).normalize()); + rhs_elements.push_back((-g1_ct(x1, y1))); rhs_scalars.push_back(recursion_separator_challenge); } @@ -380,13 +370,13 @@ aggregation_state verify_proof(typename Curve::Composer* context, for (const auto& to_add : elements_to_add) { opening_result = opening_result + to_add; } - opening_result = opening_result.normalize(); g1_ct rhs = g1_ct::template wnaf_batch_mul<128>(rhs_elements, rhs_scalars); - rhs = rhs + PI_Z; - rhs = (-rhs).normalize(); - std::vector proof_witness_indices{ + rhs = (-rhs) - PI_Z; + + // TODO(zac: remove this once a3-packages has migrated to calling `assign_object_to_proof_outputs`) + std::vector proof_witness_indices = { opening_result.x.binary_basis_limbs[0].element.normalize().witness_index, opening_result.x.binary_basis_limbs[1].element.normalize().witness_index, opening_result.x.binary_basis_limbs[2].element.normalize().witness_index, @@ -404,10 +394,10 @@ aggregation_state verify_proof(typename Curve::Composer* context, rhs.y.binary_basis_limbs[2].element.normalize().witness_index, rhs.y.binary_basis_limbs[3].element.normalize().witness_index, }; - - return aggregation_state{ - opening_result, rhs, transcript.get_field_element_vector("public_inputs"), proof_witness_indices, true, + auto result = aggregation_state{ + opening_result, rhs, transcript.get_field_element_vector("public_inputs"), proof_witness_indices, true }; + return result; } } // namespace recursion diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp index 82e8042b02fa..db3dfc2cc1e6 100644 --- 
a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp @@ -212,6 +212,8 @@ template class stdlib_verifier : public testing::Test { stdlib::recursion::verify_proof( &outer_composer, verification_key_b, recursive_manifest, recursive_proof_b, previous_output); + verification_key_b->compress(); + verification_key->compress(); return { output, verification_key }; } @@ -301,7 +303,7 @@ template class stdlib_verifier : public testing::Test { EXPECT_EQ(inner_proof_result, barretenberg::fq12::one()); - circuit_output.aggregation_state.add_proof_outputs_as_public_inputs(); + circuit_output.aggregation_state.assign_object_to_proof_outputs(); EXPECT_EQ(outer_composer.failed(), false); @@ -346,7 +348,7 @@ template class stdlib_verifier : public testing::Test { EXPECT_EQ(inner_proof_result, barretenberg::fq12::one()); - circuit_output.aggregation_state.add_proof_outputs_as_public_inputs(); + circuit_output.aggregation_state.assign_object_to_proof_outputs(); EXPECT_EQ(outer_composer.failed(), false); @@ -379,6 +381,9 @@ template class stdlib_verifier : public testing::Test { InnerComposer inner_composer_a = InnerComposer("../srs_db/ignition"); InnerComposer inner_composer_b = InnerComposer("../srs_db/ignition"); + OuterComposer mid_composer_a = OuterComposer("../srs_db/ignition"); + OuterComposer mid_composer_b = OuterComposer("../srs_db/ignition"); + OuterComposer outer_composer = OuterComposer("../srs_db/ignition"); std::vector inner_inputs{ barretenberg::fr::random_element(), @@ -388,7 +393,27 @@ template class stdlib_verifier : public testing::Test { create_inner_circuit(inner_composer_a, inner_inputs); create_inner_circuit(inner_composer_b, inner_inputs); - auto circuit_output = create_double_outer_circuit(inner_composer_a, inner_composer_b, outer_composer); + auto circuit_output_a = create_outer_circuit(inner_composer_a, mid_composer_a); + + uint256_t a0 = circuit_output_a.aggregation_state.P0.x.binary_basis_limbs[1].element.get_value(); + uint256_t a1 = circuit_output_a.aggregation_state.P0.y.binary_basis_limbs[1].element.get_value(); + uint256_t a2 = circuit_output_a.aggregation_state.P1.x.binary_basis_limbs[1].element.get_value(); + uint256_t a3 = circuit_output_a.aggregation_state.P1.y.binary_basis_limbs[1].element.get_value(); + + ASSERT(a0.get_msb() <= 68); + ASSERT(a1.get_msb() <= 68); + ASSERT(a2.get_msb() <= 68); + ASSERT(a3.get_msb() <= 68); + + circuit_output_a.aggregation_state.assign_object_to_proof_outputs(); + + auto circuit_output_b = create_outer_circuit(inner_composer_b, mid_composer_b); + + circuit_output_b.aggregation_state.assign_object_to_proof_outputs(); + + auto circuit_output = create_double_outer_circuit(mid_composer_a, mid_composer_b, outer_composer); + + circuit_output.aggregation_state.assign_object_to_proof_outputs(); g1::affine_element P[2]; P[0].x = barretenberg::fq(circuit_output.aggregation_state.P0.x.get_value().lo); @@ -398,8 +423,8 @@ template class stdlib_verifier : public testing::Test { barretenberg::fq12 inner_proof_result = barretenberg::pairing::reduced_ate_pairing_batch_precomputed( P, circuit_output.verification_key->reference_string->get_precomputed_g2_lines(), 2); - EXPECT_EQ(circuit_output.aggregation_state.public_inputs[0].get_value(), inner_inputs[0]); - EXPECT_EQ(circuit_output.aggregation_state.public_inputs[1].get_value(), inner_inputs[1]); + EXPECT_EQ(circuit_output_a.aggregation_state.public_inputs[0].get_value(), inner_inputs[0]); + 
EXPECT_EQ(circuit_output_a.aggregation_state.public_inputs[1].get_value(), inner_inputs[1]); EXPECT_EQ(inner_proof_result, barretenberg::fq12::one()); @@ -456,7 +481,6 @@ template class stdlib_verifier : public testing::Test { EXPECT_EQ(inner_proof_result, barretenberg::fq12::one()); printf("composer gates = %zu\n", outer_composer.get_num_gates()); - auto prover = outer_composer.create_prover(); auto verifier = outer_composer.create_verifier(); @@ -638,14 +662,23 @@ HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition) HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition_ultra_no_tables) { - TestFixture::test_recursive_proof_composition_ultra_no_tables(); + if constexpr (TypeParam::type == ComposerType::PLOOKUP) { + TestFixture::test_recursive_proof_composition_ultra_no_tables(); + } else { + // no point running this if we're not in UltraPlonk + GTEST_SKIP(); + } }; -// CircleCI can't cope with this. -// HEAVY_TYPED_TEST(stdlib_verifier, double_verification) -// { -// TestFixture::test_double_verification(); -// }; +HEAVY_TYPED_TEST(stdlib_verifier, double_verification) +{ + if constexpr (TypeParam::type == ComposerType::PLOOKUP) { + TestFixture::test_double_verification(); + } else { + // CircleCI can't cope with non-ultraplonk version. + GTEST_SKIP(); + } +}; HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition_with_variable_verification_key_a) { diff --git a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp index 1f4410148b4f..36c128f3ea0a 100644 --- a/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp +++ b/barretenberg/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp @@ -188,7 +188,7 @@ template class stdlib_verifier_turbo : public testing:: EXPECT_EQ(inner_proof_result, barretenberg::fq12::one()); - circuit_output.aggregation_state.add_proof_outputs_as_public_inputs(); + circuit_output.aggregation_state.assign_object_to_proof_outputs(); EXPECT_EQ(outer_composer.failed(), false); std::cout << "creating prover" << std::endl; diff --git a/barretenberg/cpp/src/barretenberg/transcript/transcript.cpp b/barretenberg/cpp/src/barretenberg/transcript/transcript.cpp index 9c448949c91e..8f597c454b6a 100644 --- a/barretenberg/cpp/src/barretenberg/transcript/transcript.cpp +++ b/barretenberg/cpp/src/barretenberg/transcript/transcript.cpp @@ -46,10 +46,12 @@ std::array Keccak256Hasher::hash(std std::array Blake3sHasher::hash(std::vector const& buffer) { - std::vector hash_result = blake3::blake3s(buffer); + grumpkin::fq input = grumpkin::fq::serialize_from_buffer(&buffer[0]); + grumpkin::fq compressed = crypto::pedersen_commitment::compress_native({ input }); + std::vector res = to_buffer(compressed); std::array result; for (size_t i = 0; i < PRNG_OUTPUT_SIZE; ++i) { - result[i] = hash_result[i]; + result[i] = res[i]; } return result; } @@ -59,10 +61,12 @@ std::array Blake3sHasher::hash_plookup // TODO(@zac-williamson) Change to call a Poseidon hash and create a PoseidonHasher // (not making the name change right now as it will break concurrent work w. 
getting recursion working in Noir) // We also need to implement a Poseidon gadget - std::vector compressed_buffer = crypto::pedersen_commitment::lookup::compress_native(buffer); + grumpkin::fq input = grumpkin::fq::serialize_from_buffer(&buffer[0]); + grumpkin::fq compressed = crypto::pedersen_commitment::lookup::compress_native({ input }); + std::vector res = to_buffer(compressed); std::array result; for (size_t i = 0; i < PRNG_OUTPUT_SIZE; ++i) { - result[i] = compressed_buffer[i]; + result[i] = res[i]; } return result; } @@ -259,7 +263,11 @@ void Transcript::apply_fiat_shamir(const std::string& challenge_name /*, const b } std::vector rolling_buffer(base_hash.begin(), base_hash.end()); - rolling_buffer.push_back(0); + if (hasher == HashType::Keccak256) { + rolling_buffer.push_back(0); + } else { + rolling_buffer[31] = (0); + } // Compute how many hashes we need so that we have enough distinct chunks of 'random' bytes to distribute // across the num_challenges. diff --git a/barretenberg/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol b/barretenberg/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol index 4b118cdef605..748d20fcb909 100644 --- a/barretenberg/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol +++ b/barretenberg/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol @@ -1,72 +1,72 @@ -// Verification Key Hash: 507de35addf16b79526d713259492d5d1764fdb6ce55ff4ccb03c147b72f381a +// Verification Key Hash: b665bc769f274feb94ea7f9997fa684b414aa8b9b9bac0227c7ce2e1cbd3d115 // SPDX-License-Identifier: Apache-2.0 // Copyright 2022 Aztec pragma solidity >=0.8.4; library RecursiveUltraVerificationKey { function verificationKeyHash() internal pure returns (bytes32) { - return 0x507de35addf16b79526d713259492d5d1764fdb6ce55ff4ccb03c147b72f381a; + return 0xb665bc769f274feb94ea7f9997fa684b414aa8b9b9bac0227c7ce2e1cbd3d115; } function loadVerificationKey(uint256 _vk, uint256 _omegaInverseLoc) internal pure { assembly { - mstore(add(_vk, 0x00), 0x0000000000000000000000000000000000000000000000000000000000080000) // vk.circuit_size + mstore(add(_vk, 0x00), 0x0000000000000000000000000000000000000000000000000000000000040000) // vk.circuit_size mstore(add(_vk, 0x20), 0x0000000000000000000000000000000000000000000000000000000000000010) // vk.num_inputs - mstore(add(_vk, 0x40), 0x2260e724844bca5251829353968e4915305258418357473a5c1d597f613f6cbd) // vk.work_root - mstore(add(_vk, 0x60), 0x3064486657634403844b0eac78ca882cfd284341fcb0615a15cfcd17b14d8201) // vk.domain_inverse - mstore(add(_vk, 0x80), 0x18fe72968b540c1dad6c7648fcb3407edfc489d8dcf3fdce314c1f0e72684c43) // vk.Q1.x - mstore(add(_vk, 0xa0), 0x16f49263ee016852edfed2e84bf44c22b31064b9034b62059329b2af2f349c37) // vk.Q1.y - mstore(add(_vk, 0xc0), 0x1c382676d0f8e5691def3a60d533850f573c36aa200ab364c091acc4a7eb094f) // vk.Q2.x - mstore(add(_vk, 0xe0), 0x17c05ca7ea679681a3cf772fabcf2c1a988e39910f1ba8de3d1f68ffb0effda1) // vk.Q2.y - mstore(add(_vk, 0x100), 0x257d75dead2d8cbb2f63b3592a762a2c2dbe0195a533736fd01982370e768676) // vk.Q3.x - mstore(add(_vk, 0x120), 0x258b6d74446f5e532bce6e1a62372a82986eac9801c13a8553f373c30398a47c) // vk.Q3.y - mstore(add(_vk, 0x140), 0x290ff6a808f6abe7508a8c884ea0fc2f819e23a5b6d7c2dd1105da2a3f0637e0) // vk.Q4.x - mstore(add(_vk, 0x160), 0x2e6c3c419be44ed56b61069a06e980360f58830ad52b38bb69de92c456ebf0ca) // vk.Q4.y - mstore(add(_vk, 0x180), 0x282e6e14bbedfc7ef013feb4877ce9098389abfd3ad8899c957be4fdb20d0454) // vk.Q_M.x - mstore(add(_vk, 0x1a0), 0x2483d06975c3965d3f2d205ddeff196b90ca5883878bffc0bd190a357fee947e) // 
vk.Q_M.y - mstore(add(_vk, 0x1c0), 0x09af8fed71838d47b0052d8e3fdda11f55c62a6f2cb9aab24edd90b5e9640e9c) // vk.Q_C.x - mstore(add(_vk, 0x1e0), 0x2bdf7549fa146188dd750d032d9dec911c5799ca99f72405c4ac49f3f9e3a51a) // vk.Q_C.y - mstore(add(_vk, 0x200), 0x1479a535c87c413301d82c5ae1598b46c03117a57b878416d1143bb48f1df8bf) // vk.Q_ARITHMETIC.x - mstore(add(_vk, 0x220), 0x03203e3c02cc68282d93507d0ad9d56304d5a4b2908233bcb6f8682f8b264532) // vk.Q_ARITHMETIC.y - mstore(add(_vk, 0x240), 0x0cccd1de3f4ef2a2bfffbb7a91f8be2c49e9dc9b565ba4312015a88558f40d20) // vk.QSORT.x - mstore(add(_vk, 0x260), 0x092c5bd4edb996d6c1189a2682f6e93ede4b9aff7f07823605c894f833316718) // vk.QSORT.y - mstore(add(_vk, 0x280), 0x20089848d81ee4e8d7700679e7b5ed017916e2ee28bf76c0e0f4862274637bb8) // vk.Q_ELLIPTIC.x - mstore(add(_vk, 0x2a0), 0x0faae100924d24a70708e49a08ba2ba9df261088bf04e7b4c3f811cc0d8995fe) // vk.Q_ELLIPTIC.y - mstore(add(_vk, 0x2c0), 0x2de71f46452329536fe14dfff808692c405b9ef1ae47c451be8383ded868af5c) // vk.Q_AUX.x - mstore(add(_vk, 0x2e0), 0x0a520e2f877f19cc69aad2396bf741e6864a9f0b657887e80165b794f7612e71) // vk.Q_AUX.y - mstore(add(_vk, 0x300), 0x2779b1b7b8433eeee7333a1372feb4587da74e2c93cc54917e201748ed847204) // vk.SIGMA1.x - mstore(add(_vk, 0x320), 0x2198823f66ad59612f6cb77aff9437388abdbcc4d8f6eac792d8bca7d1b341d9) // vk.SIGMA1.y - mstore(add(_vk, 0x340), 0x1f6732b9d128931b2e32b2cae73b029720cca3cef23fee25363d520ed0ba3f92) // vk.SIGMA2.x - mstore(add(_vk, 0x360), 0x15fb336844e68b08361c10b83e7d6ea0f011958774e58e5f7c43e6606e989ecc) // vk.SIGMA2.y - mstore(add(_vk, 0x380), 0x0984b1b6c723afb4713656abf30b06e2ad04c054dd3acf016a6db1ee7111ca11) // vk.SIGMA3.x - mstore(add(_vk, 0x3a0), 0x03421d01f19c6b91e477648819f57d888b3b23b67599266293bddf91a2636ff1) // vk.SIGMA3.y - mstore(add(_vk, 0x3c0), 0x2f77cda90d366b151b17c5667f10526ab0fe144aecb307e00ede6039365bcfa0) // vk.SIGMA4.x - mstore(add(_vk, 0x3e0), 0x0d1e8f758babcbbf134dfe341c262ee25d0254cba8f5487ad5bddd190f27a9e8) // vk.SIGMA4.y - mstore(add(_vk, 0x400), 0x2f61a890b9f1dff4ef5c8b0eafe9b71c7a23dc4c8a6791d9c01418310f4a7b2e) // vk.TABLE1.x - mstore(add(_vk, 0x420), 0x07c8a51d1881fcdfe1cb7dcefc48a44047c7f5386797d5f8553ce2e12e8daba0) // vk.TABLE1.y - mstore(add(_vk, 0x440), 0x1adf56913dea23b7b14c952933b0b40fc476dc2697a758ec9df73802b0596c2f) // vk.TABLE2.x - mstore(add(_vk, 0x460), 0x212a1759e19285a35a70a245cca6477f89b6f156e4425cf52cfccb4594f59152) // vk.TABLE2.y - mstore(add(_vk, 0x480), 0x1527f8c19085ac209ebddbccae4dd0ca58b078e56fd20d651ce3a3194697b191) // vk.TABLE3.x - mstore(add(_vk, 0x4a0), 0x02247dca9c3cb09318aa6100a2a7c628281c69bc41cfda34aa72c263b69344b4) // vk.TABLE3.y - mstore(add(_vk, 0x4c0), 0x12eea56d2ada3befa5db215ea5ebbd37b5ce95fcd1cf7adb94d5a1784876b4f7) // vk.TABLE4.x - mstore(add(_vk, 0x4e0), 0x190df1146fbdd5cc79e8817ebcd6311e35cf5cc38795cee26371a707d685e05a) // vk.TABLE4.y - mstore(add(_vk, 0x500), 0x019b3a1970f9f77b13538cd8071ea3ee7c556fd98009e2a04be044ead0a94623) // vk.TABLE_TYPE.x - mstore(add(_vk, 0x520), 0x159cbdae3e194fe45524a171befdcb98b55c8d495fc463c98ac690eee947119f) // vk.TABLE_TYPE.y - mstore(add(_vk, 0x540), 0x16b2f7fa29f578aae3d4c0b8220101570adfcc9e8aa8a148267208540de189f1) // vk.ID1.x - mstore(add(_vk, 0x560), 0x2344a211fbbacc281de980197e4f12155d90d55a67f4ad08398bac665f813953) // vk.ID1.y - mstore(add(_vk, 0x580), 0x1af709df675db1688b95927324e71c5e551436ba7cb32478570a9cfaebf90614) // vk.ID2.x - mstore(add(_vk, 0x5a0), 0x2b83e76f61aa5cd70218c38e693ae0a99e9a2f4a192af5c77dbd27fa605fdae4) // vk.ID2.y - mstore(add(_vk, 0x5c0), 
0x038c89635a8b6ec9766d5f98d13c16f8c312088f830610de72c00edf8c3b7800) // vk.ID3.x - mstore(add(_vk, 0x5e0), 0x1863d9217ba6c6764fa02298efe25fabfbe454a27431b970a6afff5d1986fadb) // vk.ID3.y - mstore(add(_vk, 0x600), 0x259a5dd47d44d6240407c26718201a122fb4b6b38d838f6e24d1c75515016761) // vk.ID4.x - mstore(add(_vk, 0x620), 0x14db344b735ffe084107e5cea07b00e4c41a82f0073f76e0536cd7118d78866f) // vk.ID4.y + mstore(add(_vk, 0x40), 0x19ddbcaf3a8d46c15c0176fbb5b95e4dc57088ff13f4d1bd84c6bfa57dcdc0e0) // vk.work_root + mstore(add(_vk, 0x60), 0x30644259cd94e7dd5045d7a27013b7fcd21c9e3b7fa75222e7bda49b729b0401) // vk.domain_inverse + mstore(add(_vk, 0x80), 0x16f7fc6133c8fb2dab06c57392df697a53357ecd918d749d1c981dcd0ee6d849) // vk.Q1.x + mstore(add(_vk, 0xa0), 0x2ba047103f9f86b84058d718a082e2faa53e50109e7cb880d2cbb7a1bf98da89) // vk.Q1.y + mstore(add(_vk, 0xc0), 0x1b9d146737dbb7759e0cad93ad4a7669880a062aceb7b46b8485327976d7285c) // vk.Q2.x + mstore(add(_vk, 0xe0), 0x11de7c3d638acc90e7f844c08658d0588da864268e00576d26aaca3cf49af350) // vk.Q2.y + mstore(add(_vk, 0x100), 0x1466840d8ad2dfde3a55d4c98412a05807bbe8aac33c27ba100c1e621fbebba0) // vk.Q3.x + mstore(add(_vk, 0x120), 0x2198ce44955b8ac6e21ddcbb66acd9df7596ad9e5fcf22f2227e8bbb51fe44ee) // vk.Q3.y + mstore(add(_vk, 0x140), 0x18b96a49db3644e2986f811b8c104e8eb88aa5eb9aec0ca109322a64885688bd) // vk.Q4.x + mstore(add(_vk, 0x160), 0x2ffec963826849cabd279a2b9f9a26f81518eb65d882f47a32470fc52f53def0) // vk.Q4.y + mstore(add(_vk, 0x180), 0x09dd725897471fddc177b241d7abc402705acfa452707388fa62666ad454598c) // vk.Q_M.x + mstore(add(_vk, 0x1a0), 0x03a46eb7ed69136e109e2761fb707da7cee18b3d05e581f24d77853b3b03581e) // vk.Q_M.y + mstore(add(_vk, 0x1c0), 0x304db51670cb2c59e3088431803e82bce8c81b38eefa267871ae2103ca7842ca) // vk.Q_C.x + mstore(add(_vk, 0x1e0), 0x1d7ec7d8d4a74e337de26b7adaecb8beb03d8cd647aa180bc08de840038710d5) // vk.Q_C.y + mstore(add(_vk, 0x200), 0x1db65122bf0f0a58fe07bd7342d3e26b07923041cb7d2158d13fb7b5328da40e) // vk.Q_ARITHMETIC.x + mstore(add(_vk, 0x220), 0x1691db1eeedbcb4f7646959cf363c00b7e26812a225edf5a6972d815270770f5) // vk.Q_ARITHMETIC.y + mstore(add(_vk, 0x240), 0x2a63b6a306e30d87f4b8597cbd1dcecff5fc7cacb774247fca6531e3d347ada4) // vk.QSORT.x + mstore(add(_vk, 0x260), 0x2849d2901fcd1f048924fb77e9451ad45d80f9f842418146b1fde0a7c752fc5f) // vk.QSORT.y + mstore(add(_vk, 0x280), 0x0e42866979ddac27ac729352dd0f844da4fb5a1c3e2480b5b940acd12304c700) // vk.Q_ELLIPTIC.x + mstore(add(_vk, 0x2a0), 0x017ac9a40547e866bdb914dc2b73661c0ec8aa67956c8c9bf406795f75e15c53) // vk.Q_ELLIPTIC.y + mstore(add(_vk, 0x2c0), 0x1ad08199bf79952adff0aa3a9c04a26f18ad7deed1fbed0548f2c83ddf913ef9) // vk.Q_AUX.x + mstore(add(_vk, 0x2e0), 0x151df9277b110c615c058f7f783105d03cab938f23884afed1897d0049715d21) // vk.Q_AUX.y + mstore(add(_vk, 0x300), 0x0bd26d62138b721fdc08fd7d52cd3dfaa37399eb416af0ec6237f9ec1a63a5c0) // vk.SIGMA1.x + mstore(add(_vk, 0x320), 0x103282cd2ef4210ac390d70a1cba58c6792a5d872ae0337615f8ac9997d300ef) // vk.SIGMA1.y + mstore(add(_vk, 0x340), 0x08abaa91c69ffa73d80d9a9562020c2a104771f07cf4099cbbe9a0071befb1cc) // vk.SIGMA2.x + mstore(add(_vk, 0x360), 0x1a82e5cd4a2c3de77afb2ca76c89b54991a4db3939a5c24806af01a0f69a2366) // vk.SIGMA2.y + mstore(add(_vk, 0x380), 0x26d50e2d19c429d1a2987d5249b88e388f93339fc05f52939fa2e1f4be653918) // vk.SIGMA3.x + mstore(add(_vk, 0x3a0), 0x0a49cd57e79633ea43cc3172e819327ce260682d8b571d0964678a153c17e959) // vk.SIGMA3.y + mstore(add(_vk, 0x3c0), 0x1c82f3e7c57b08ef90fda6fe39427b815a835c8559b64eac0a4b213998f6802c) // vk.SIGMA4.x + mstore(add(_vk, 
0x3e0), 0x098bad014a270b6f5e4c90cbd299c15c5fd190457f0e78a5f849243e86688868) // vk.SIGMA4.y + mstore(add(_vk, 0x400), 0x215a055ec0bf7d7ab5e005b4260258aaadfd8ae9005a09060fdd0cee02dc3fea) // vk.TABLE1.x + mstore(add(_vk, 0x420), 0x1841eba177a34b1eb908727fe2e54bf33fc82b6e58dfd044acd4ba05ca80c837) // vk.TABLE1.y + mstore(add(_vk, 0x440), 0x018eb037682044ebf9cad76f777bf379b94c4d31d4351ce9677ff146a744555c) // vk.TABLE2.x + mstore(add(_vk, 0x460), 0x2bf87d72f0aef257c728503c900516f9274ab06eb54804651218438e40f06c25) // vk.TABLE2.y + mstore(add(_vk, 0x480), 0x13b003b384fb50e00994bf62a0057f44344be47383d59a7e9f1319d710ab5263) // vk.TABLE3.x + mstore(add(_vk, 0x4a0), 0x1a5f338a3d05fb46ea46855e6c36dbdb23c5f20a56acc795324fe2958189ec39) // vk.TABLE3.y + mstore(add(_vk, 0x4c0), 0x1365fd683dbad2c4c55b02dd33c4b96fde00e5bb3f52be20ead95484e130aee1) // vk.TABLE4.x + mstore(add(_vk, 0x4e0), 0x2da2ba1d27548e452cc863758acf156eb268f577b7d08ba58e7bbf2d28f6f23c) // vk.TABLE4.y + mstore(add(_vk, 0x500), 0x0ef908712f03ce2e4db3ef557abbde7c584d8c831165ba40ab43124526c53cc1) // vk.TABLE_TYPE.x + mstore(add(_vk, 0x520), 0x009dd642bc5eb1869048b59d2052645208cc5a14537814568d9c985c93319e55) // vk.TABLE_TYPE.y + mstore(add(_vk, 0x540), 0x0f973c9af1150675ae6dac1ea8ea366e5b8db13bb9c2237ab11c40dfb644ebf5) // vk.ID1.x + mstore(add(_vk, 0x560), 0x06b0c966f9edab490ac15a176d35d56996cc66854268197989a53ab0d1368188) // vk.ID1.y + mstore(add(_vk, 0x580), 0x09e719130bb46416efa070d08d82cc07fe0ed3bd8685616b92b4b9619e0807b2) // vk.ID2.x + mstore(add(_vk, 0x5a0), 0x18f35ee01438dda2443da27299404d09ccfff098a0ceac2e9a10bf2a96bc11ac) // vk.ID2.y + mstore(add(_vk, 0x5c0), 0x0cb835c737d324b9ff5bba45988dc4921104803b7e37649f8c628f0de26361ac) // vk.ID3.x + mstore(add(_vk, 0x5e0), 0x18ca0ac87859387aa32c6939f7a4a0d322879a3fdb1ef85d06addcddc13acea5) // vk.ID3.y + mstore(add(_vk, 0x600), 0x0047304b09efd9315a96d9e802c9a50c1964076026e5f17aff825d6cfc38d823) // vk.ID4.x + mstore(add(_vk, 0x620), 0x21c9f3aa4cbe8ee21422052f7c22d3d8a5a9a89c262a5a5cb52d8802f6106c49) // vk.ID4.y mstore(add(_vk, 0x640), 0x01) // vk.contains_recursive_proof mstore(add(_vk, 0x660), 0) // vk.recursive_proof_public_input_indices mstore(add(_vk, 0x680), 0x260e01b251f6f1c7e7ff4e580791dee8ea51d87a358e038b4efe30fac09383c1) // vk.g2_x.X.c1 mstore(add(_vk, 0x6a0), 0x0118c4d5b837bcc2bc89b5b398b5974e9f5944073b32078b7e231fec938883b0) // vk.g2_x.X.c0 mstore(add(_vk, 0x6c0), 0x04fc6369f7110fe3d25156c1bb9a72859cf2a04641f99ba4ee413c80da6a5fe4) // vk.g2_x.Y.c1 mstore(add(_vk, 0x6e0), 0x22febda3c0c0632a56475b4214e5615e11e6dd3f96e6cea2854a87d4dacc5e55) // vk.g2_x.Y.c0 - mstore(_omegaInverseLoc, 0x06e402c0a314fb67a15cf806664ae1b722dbc0efe66e6c81d98f9924ca535321) // vk.work_root_inverse + mstore(_omegaInverseLoc, 0x036853f083780e87f8d7c71d111119c57dbe118c22d5ad707a82317466c5174c) // vk.work_root_inverse } } }
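Note on the aggregation-object limbs touched above: the verifier hunks rebuild the recursive aggregation point from public-input field elements and now range-constrain each recovered limb before reuse, the three low limbs to fq_ct::NUM_LIMB_BITS and the top limb to fq_ct::NUM_LAST_LIMB_BITS, and the reworked double-verification test asserts the exported limb values fit in 68 bits. The sketch below only illustrates that split/recombine-with-range-check pattern; it is not barretenberg code. It uses a 128-bit toy value with 32-bit limbs (and the non-standard `unsigned __int128` supported by GCC/Clang) so it compiles stand-alone, and NUM_LIMB_BITS / NUM_LIMBS here are local stand-ins for the stdlib constants, which for the real bigfield are 68-bit binary-basis limbs.

#include <array>
#include <cassert>
#include <cstdint>

using uint128 = unsigned __int128;

constexpr unsigned NUM_LIMB_BITS = 32; // stand-in; the real bigfield limbs are 68 bits
constexpr unsigned NUM_LIMBS = 4;

std::array<uint64_t, NUM_LIMBS> split_into_limbs(uint128 value)
{
    std::array<uint64_t, NUM_LIMBS> limbs{};
    const uint128 mask = (uint128(1) << NUM_LIMB_BITS) - 1;
    for (unsigned i = 0; i < NUM_LIMBS; ++i) {
        limbs[i] = static_cast<uint64_t>((value >> (i * NUM_LIMB_BITS)) & mask);
    }
    return limbs;
}

uint128 recombine(const std::array<uint64_t, NUM_LIMBS>& limbs)
{
    uint128 value = 0;
    for (unsigned i = 0; i < NUM_LIMBS; ++i) {
        // In-circuit, each limb recovered from the public inputs gets a range
        // constraint of NUM_LIMB_BITS (NUM_LAST_LIMB_BITS for the top limb)
        // before being reassembled; without it, oversized limbs could encode a
        // different value than the one the inner circuit exported.
        assert(limbs[i] < (uint128(1) << NUM_LIMB_BITS));
        value += uint128(limbs[i]) << (i * NUM_LIMB_BITS);
    }
    return value;
}

int main()
{
    const uint128 x = (uint128(0x0123456789abcdefULL) << 64) | 0xfedcba9876543210ULL;
    assert(recombine(split_into_limbs(x)) == x);
    return 0;
}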
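Note on the final pairing in the tests above: the double-verification test exports P0 and P1 from the aggregation state and passes them to reduced_ate_pairing_batch_precomputed, expecting fq12::one(). Under the usual KZG convention for these aggregation states (an assumption here, the diff does not spell it out, and the ordering of the two precomputed G2 lines is taken from the SRS), that is the deferred check

\[
e(P_0,\;[x]_2)\cdot e(P_1,\;[1]_2) \;=\; 1 \in \mathbb{F}_{q^{12}}.
\]

Deferring this single pairing product is what lets each recursive layer avoid in-circuit pairings: the sixteen limb witnesses listed in proof_witness_indices carry P0 and P1 through the public inputs to the next circuit (or to the native verifier), where the check is performed once.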
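Note on the transcript change above: apply_fiat_shamir now distinguishes the two hashers when expanding one base hash into several challenges. The Keccak256 path appends an extra counter byte to the 32-byte buffer, while the Pedersen-based path overwrites byte 31, so the counter lands inside the 32 bytes that Blake3sHasher::hash actually deserializes into a grumpkin::fq (an appended 33rd byte would be ignored by serialize_from_buffer and every round would hash the same input). A minimal sketch of that expansion loop follows; `hash32` is a hypothetical stand-in for the transcript hasher, not a barretenberg function, and its toy body exists only so the example compiles, whereas the real code calls Keccak256 or the Pedersen compression shown in the diff.

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

using Hash = std::array<uint8_t, 32>;

// Hypothetical stand-in for the transcript hasher. Toy mixing only, so the
// sketch is self-contained; NOT a real or cryptographic hash.
Hash hash32(const std::vector<uint8_t>& input)
{
    Hash out{};
    uint8_t acc = 0;
    for (size_t i = 0; i < input.size(); ++i) {
        acc = static_cast<uint8_t>(acc * 31 + input[i]);
        out[i % out.size()] ^= acc;
    }
    return out;
}

// Expand one base hash into `num_hashes` distinct 32-byte chunks by varying a
// counter byte, mirroring the rolling-buffer loop in apply_fiat_shamir.
std::vector<Hash> expand_challenges(const Hash& base_hash, size_t num_hashes, bool use_keccak)
{
    std::vector<uint8_t> rolling_buffer(base_hash.begin(), base_hash.end());
    if (use_keccak) {
        rolling_buffer.push_back(0); // byte-oriented hash: counter goes in a 33rd byte
    }
    // field-element hash: the counter reuses byte 31, keeping the buffer at the
    // 32 bytes that get deserialized into a single field element

    std::vector<Hash> out;
    out.reserve(num_hashes);
    for (size_t i = 0; i < num_hashes; ++i) {
        rolling_buffer.back() = static_cast<uint8_t>(i);
        out.push_back(hash32(rolling_buffer));
    }
    return out;
}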