feat: Parallel IPA (#3882)
This PR:
1. Adds a new function run_loop_in_parallel to thread.hpp, so that parallelism is easier to use in most cases without redoing the chunk-splitting computation at every call site (see the usage sketch after this list).
2. Uses this new functionality to parallelise logic in the IPA open and verify procedures (and the methods they use), yielding a 20x-30x improvement for ECCVM proving.
3. Fixes an error in the use of one of the vectors in the IPA opening procedure which introduced an additional O(n log n) cost.
4. Adds an IPA opening and verification benchmark
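
A minimal usage sketch of the new helper (the call site and fill_squares function below are hypothetical illustrations; only the run_loop_in_parallel signature and the no_multhreading_if_less_or_equal parameter come from this PR). The helper hands each worker a [start, end) range and falls back to a single thread when the iteration count is at most the last argument:

#include "barretenberg/common/thread.hpp"

#include <cstddef>
#include <vector>

// Hypothetical call site: fill a vector without hand-rolling the chunk split.
void fill_squares(std::vector<size_t>& results)
{
    run_loop_in_parallel(
        results.size(),
        [&results](size_t start, size_t end) {
            for (size_t i = start; i < end; i++) {
                results[i] = i * i; // placeholder for real per-element work
            }
        },
        /*no_multhreading_if_less_or_equal=*/16);
}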
Rumata888 authored and AztecBot committed Jan 12, 2024
1 parent 571ae25 commit 6952636
Showing 8 changed files with 419 additions and 144 deletions.
1 change: 1 addition & 0 deletions cpp/src/barretenberg/benchmark/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(ipa_bench)
add_subdirectory(decrypt_bench)
add_subdirectory(pippenger_bench)
add_subdirectory(plonk_bench)
18 changes: 18 additions & 0 deletions cpp/src/barretenberg/benchmark/ipa_bench/CMakeLists.txt
@@ -0,0 +1,18 @@
# Each source represents a separate benchmark suite
set(BENCHMARK_SOURCES
ipa.bench.cpp
)

# Required libraries for benchmark suites
set(LINKED_LIBRARIES
benchmark::benchmark
ultra_honk
)

# Add executable and custom target for each suite, e.g. ultra_honk_bench
foreach(BENCHMARK_SOURCE ${BENCHMARK_SOURCES})
get_filename_component(BENCHMARK_NAME ${BENCHMARK_SOURCE} NAME_WE) # extract name without extension
add_executable(${BENCHMARK_NAME}_bench ${BENCHMARK_SOURCE})
target_link_libraries(${BENCHMARK_NAME}_bench ${LINKED_LIBRARIES})
add_custom_target(run_${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
endforeach()
73 changes: 73 additions & 0 deletions cpp/src/barretenberg/benchmark/ipa_bench/ipa.bench.cpp
@@ -0,0 +1,73 @@
#include "barretenberg/commitment_schemes/ipa/ipa.hpp"
#include <benchmark/benchmark.h>

using namespace benchmark;
using namespace barretenberg;
using namespace proof_system;
using namespace proof_system::honk::pcs::ipa;
namespace {
using Curve = curve::Grumpkin;
using Fr = Curve::ScalarField;
using IPA = IPA<Curve>;
using OpeningPair = honk::pcs::OpeningPair<Curve>;
using OpeningClaim = honk::pcs::OpeningClaim<Curve>;
using Polynomial = Polynomial<Curve::ScalarField>;
using CommitmentKey = honk::pcs::CommitmentKey<Curve>;
using VerifierCommitmentKey = honk::pcs::VerifierCommitmentKey<Curve>;

constexpr size_t MIN_POLYNOMIAL_DEGREE_LOG2 = 10;
constexpr size_t MAX_POLYNOMIAL_DEGREE_LOG2 = 16;
std::shared_ptr<barretenberg::srs::factories::CrsFactory<curve::Grumpkin>> crs_factory(
new barretenberg::srs::factories::FileCrsFactory<curve::Grumpkin>("../srs_db/grumpkin", 1 << 16));

auto ck = std::make_shared<CommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);
auto vk = std::make_shared<VerifierCommitmentKey>(1 << MAX_POLYNOMIAL_DEGREE_LOG2, crs_factory);

std::vector<std::shared_ptr<honk::BaseTranscript>> prover_transcripts(MAX_POLYNOMIAL_DEGREE_LOG2 -
MIN_POLYNOMIAL_DEGREE_LOG2 + 1);
std::vector<OpeningClaim> opening_claims(MAX_POLYNOMIAL_DEGREE_LOG2 - MIN_POLYNOMIAL_DEGREE_LOG2 + 1);

void ipa_open(State& state) noexcept
{
numeric::random::Engine& engine = numeric::random::get_debug_engine();
for (auto _ : state) {
state.PauseTiming();
size_t n = 1 << static_cast<size_t>(state.range(0));
// Construct the polynomial
Polynomial poly(n);
for (size_t i = 0; i < n; ++i) {
poly[i] = Fr::random_element(&engine);
}
auto x = Fr::random_element(&engine);
auto eval = poly.evaluate(x);
const OpeningPair opening_pair = { x, eval };
const OpeningClaim opening_claim{ opening_pair, ck->commit(poly) };
// initialize empty prover transcript
auto prover_transcript = std::make_shared<honk::BaseTranscript>();
state.ResumeTiming();
// Compute proof
IPA::compute_opening_proof(ck, opening_pair, poly, prover_transcript);
// Store info for verifier
prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = prover_transcript;
opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2] = opening_claim;
}
}
void ipa_verify(State& state) noexcept
{
for (auto _ : state) {
state.PauseTiming();
// Retrieve proofs
auto prover_transcript = prover_transcripts[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
auto opening_claim = opening_claims[static_cast<size_t>(state.range(0)) - MIN_POLYNOMIAL_DEGREE_LOG2];
// initialize verifier transcript from proof data
auto verifier_transcript = std::make_shared<honk::BaseTranscript>(prover_transcript->proof_data);

state.ResumeTiming();
auto result = IPA::verify(vk, opening_claim, verifier_transcript);
ASSERT(result);
}
}
} // namespace
BENCHMARK(ipa_open)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK(ipa_verify)->Unit(kMillisecond)->DenseRange(MIN_POLYNOMIAL_DEGREE_LOG2, MAX_POLYNOMIAL_DEGREE_LOG2);
BENCHMARK_MAIN();
143 changes: 93 additions & 50 deletions cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -52,46 +52,67 @@ template <typename Curve> class IPA {
auto a_vec = polynomial;
auto srs_elements = ck->srs->get_monomial_points();
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table, so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism.
// G_vec_local should use only the original SRS, so we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

std::vector<Fr> b_vec(poly_degree);
Fr b_power = 1;
for (size_t i = 0; i < poly_degree; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
run_loop_in_parallel(
poly_degree,
[&b_vec, &opening_pair](size_t start, size_t end) {
Fr b_power = opening_pair.challenge.pow(start);
for (size_t i = start; i < end; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
},
/*no_multhreading_if_less_or_equal=*/16);

// Iterate for log(poly_degree) rounds to compute the round commitments.
auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_degree));
std::vector<GroupElement> L_elements(log_poly_degree);
std::vector<GroupElement> R_elements(log_poly_degree);
std::size_t round_size = poly_degree;

// TODO(#479): restructure IPA so it can be integrated with the pthread alternative to the work queue (or even the
// work queue itself). Investigate whether parallelising parts of each IPA round brings significant
// improvements and see if reducing the size of G_vec_local and b_vec by taking the first iteration out of the
// loop can also be integrated.
// Perform IPA rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
std::mutex addition_lock;
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
for (size_t j = 0; j < round_size; j++) {
inner_prod_L += a_vec[j] * b_vec[round_size + j];
inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
// Run scalar product in parallel
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
addition_lock.lock();
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
addition_lock.unlock();
},
/*no_multhreading_if_less_or_equal=*/8);

// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_elements[i] =
// TODO(#473)
barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state);
L_elements[i] += aux_generator * inner_prod_L;

// R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator
// TODO(#473)
R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state);
R_elements[i] += aux_generator * inner_prod_R;
@@ -104,23 +125,32 @@ template <typename Curve> class IPA {
const Fr round_challenge = transcript->get_challenge("IPA:round_challenge_" + index);
const Fr round_challenge_inv = round_challenge.invert();

std::vector<Commitment> G_lo(G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size));
std::vector<Commitment> G_hi(G_vec_local.begin() + static_cast<long>(round_size), G_vec_local.end());
G_lo = GroupElement::batch_mul_with_endomorphism(G_lo, round_challenge_inv);
G_hi = GroupElement::batch_mul_with_endomorphism(G_hi, round_challenge);
auto G_lo = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
round_challenge_inv);
auto G_hi = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin() + static_cast<long>(round_size),
G_vec_local.begin() + static_cast<long>(round_size * 2) },
round_challenge);

// Update the vectors a_vec, b_vec and G_vec.
// a_vec_next = a_vec_lo * round_challenge + a_vec_hi * round_challenge_inv
// b_vec_next = b_vec_lo * round_challenge_inv + b_vec_hi * round_challenge
// G_vec_next = G_vec_lo * round_challenge_inv + G_vec_hi * round_challenge
for (size_t j = 0; j < round_size; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
run_loop_in_parallel(
round_size,
[&a_vec, &b_vec, &G_vec_local, &G_lo, &G_hi, round_challenge, round_challenge_inv, round_size](
size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
a_vec[j] *= round_challenge;
a_vec[j] += round_challenge_inv * a_vec[round_size + j];
b_vec[j] *= round_challenge_inv;
b_vec[j] += round_challenge * b_vec[round_size + j];

G_vec_local[j] = G_lo[j] + G_hi[j];
}
},
/*no_multhreading_if_less_or_equal=*/4);
}

transcript->send_to_verifier("IPA:a_0", a_vec[0]);
@@ -166,7 +196,7 @@ template <typename Curve> class IPA {
msm_scalars[2 * i] = round_challenges[i].sqr();
msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr();
}
// TODO(#473)

GroupElement LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state);
GroupElement C_zero = C_prime + LR_sums;
@@ -188,29 +218,42 @@ template <typename Curve> class IPA {
// Compute G_zero
// First construct s_vec
std::vector<Fr> s_vec(poly_degree);
for (size_t i = 0; i < poly_degree; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
run_loop_in_parallel(
poly_degree,
[&s_vec, &round_challenges, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges[log_poly_degree - 1 - j];
} else {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
}
s_vec[i] = s_vec_scalar;
}
}
s_vec[i] = s_vec_scalar;
}
},
/*no_multhreading_if_less_or_equal=*/4);

auto srs_elements = vk->srs->get_monomial_points();

// Copy the G_vector to local memory.
std::vector<Commitment> G_vec_local(poly_degree);

// The SRS stored in the commitment key is the result after applying the pippenger point table, so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism.
// G_vec_local should use only the original SRS, so we extract only the even indices.
for (size_t i = 0; i < poly_degree * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
// TODO(#473)
run_loop_in_parallel(
poly_degree,
[&G_vec_local, srs_elements](size_t start, size_t end) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*no_multhreading_if_less_or_equal=*/16);

auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
&s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state);

41 changes: 41 additions & 0 deletions cpp/src/barretenberg/common/thread.cpp
@@ -86,3 +86,44 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
#endif
#endif
}

/**
* @brief Split a loop into several loops running in parallel
*
* @details Splits num_points into an appropriate number of chunks for parallel processing and calls the function
* that contains the work loop on each chunk
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* @param no_multhreading_if_less_or_equal If num_points is less than or equal to this value, run without parallelization
*
*/
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
{
if (num_points <= no_multhreading_if_less_or_equal) {
func(0, num_points);
return;
}
// Get number of cpus we can split into
const size_t num_cpus = get_num_cpus();

// Compute the size of a single chunk
const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
// Parallelize over chunks
parallel_for(num_cpus, [num_points, chunk_size, &func](size_t chunk_index) {
// If num_points is small, sometimes we need fewer CPUs
if (chunk_size * chunk_index > num_points) {
return;
}
// Compute the current chunk size (can differ in case it's the last chunk)
size_t current_chunk_size = std::min(num_points - (chunk_size * chunk_index), chunk_size);
if (current_chunk_size == 0) {
return;
}
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;
func(start, end);
});
};
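
To make the chunk arithmetic above concrete (numbers purely illustrative): with num_points = 1000 and 16 CPUs, chunk_size = 1000/16 + 1 = 63, chunks 0-14 each cover 63 iterations, and the final chunk starts at 15 * 63 = 945 and is trimmed to min(1000 - 945, 63) = 55 iterations; any chunk whose start index would exceed num_points returns immediately.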
3 changes: 3 additions & 0 deletions cpp/src/barretenberg/common/thread.hpp
@@ -23,3 +23,6 @@ inline size_t get_num_cpus_pow2()
}

void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal = 0);
2 changes: 1 addition & 1 deletion cpp/src/barretenberg/ecc/groups/element.hpp
@@ -92,7 +92,7 @@ template <class Fq, class Fr, class Params> class alignas(32) element {

static void batch_normalize(element* elements, size_t num_elements) noexcept;
static std::vector<affine_element<Fq, Fr, Params>> batch_mul_with_endomorphism(
const std::vector<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;
const std::span<affine_element<Fq, Fr, Params>>& points, const Fr& exponent) noexcept;

Fq x;
Fq y;