From 12cc9db4fe979f51c6702f9fa68529572a11ba05 Mon Sep 17 00:00:00 2001 From: codygunton Date: Wed, 24 May 2023 09:45:27 +0000 Subject: [PATCH] Templatize: everything builds and links --- cpp/.clangd | 2 - .../benchmark/pippenger_bench/main.cpp | 7 +- .../dsl/acir_proofs/acir_proofs.cpp | 4 +- .../barretenberg/ecc/curves/bn254/bn254.hpp | 4 +- .../bn254/scalar_multiplication/c_bind.cpp | 13 +- .../bn254/scalar_multiplication/pippenger.cpp | 44 - .../bn254/scalar_multiplication/pippenger.hpp | 62 -- .../scalar_multiplication/process_buckets.cpp | 64 -- .../scalar_multiplication/process_buckets.hpp | 12 - .../scalar_multiplication/runtime_states.cpp | 212 ---- .../scalar_multiplication/runtime_states.hpp | 98 -- .../scalar_multiplication.cpp | 946 ----------------- .../scalar_multiplication.hpp | 155 --- .../scalar_multiplication.test.cpp | 937 ----------------- .../ecc/curves/grumpkin/grumpkin.hpp | 5 +- .../grumpkin/scalar_multiplication/c_bind.cpp | 50 - .../grumpkin/scalar_multiplication/c_bind.hpp | 18 - .../scalar_multiplication/pippenger.cpp | 44 - .../scalar_multiplication/pippenger.hpp | 62 -- .../scalar_multiplication/process_buckets.cpp | 64 -- .../scalar_multiplication/process_buckets.hpp | 12 - .../scalar_multiplication/runtime_states.cpp | 212 ---- .../scalar_multiplication/runtime_states.hpp | 100 -- .../scalar_multiplication.cpp | 947 ------------------ .../scalar_multiplication.hpp | 154 --- .../scalar_multiplication.test.cpp | 946 ----------------- .../curves/scalar_multiplication/c_bind.cpp | 50 - .../curves/scalar_multiplication/c_bind.hpp | 18 - .../scalar_multiplication/pippenger.cpp | 38 +- .../scalar_multiplication/pippenger.hpp | 19 +- .../scalar_multiplication/process_buckets.cpp | 4 +- .../scalar_multiplication/process_buckets.hpp | 4 +- .../scalar_multiplication/runtime_states.cpp | 56 +- .../scalar_multiplication/runtime_states.hpp | 35 +- .../scalar_multiplication.cpp | 385 ++++--- .../scalar_multiplication.hpp | 149 ++- .../scalar_multiplication.test.cpp | 246 +++-- .../ecc/curves/secp256k1/secp256k1.hpp | 5 +- .../ecc/curves/secp256r1/secp256r1.hpp | 5 +- .../barretenberg/honk/pcs/commitment_key.hpp | 12 +- cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp | 19 +- .../honk/proof_system/ultra_verifier.cpp | 4 +- .../honk/proof_system/verifier.cpp | 2 +- .../proofs/join_split/c_bind.cpp | 3 +- .../plonk/composer/composer_base.cpp | 8 +- .../turbo_plonk_composer_helper.cpp | 2 +- .../plonk/composer/standard_composer.cpp | 2 +- .../plonk/composer/turbo_composer.cpp | 2 +- .../plonk/composer/ultra_composer.cpp | 2 +- .../plonk/proof_system/prover/prover.cpp | 2 +- .../proof_system/proving_key/proving_key.hpp | 9 +- .../plonk/proof_system/verifier/verifier.cpp | 9 +- .../proof_system/verifier/verifier.test.cpp | 12 +- .../permutation_widget_impl.hpp | 4 +- .../random_widgets/plookup_widget_impl.hpp | 2 +- .../polynomials/polynomials.bench.cpp | 47 +- .../turbo_circuit_constructor.cpp | 2 +- .../proof_system/work_queue/work_queue.cpp | 6 +- cpp/src/barretenberg/srs/io.cpp | 69 +- cpp/src/barretenberg/srs/io.hpp | 25 +- .../reference_string/env_reference_string.hpp | 4 +- .../file_reference_string.hpp | 4 +- .../reference_string/mem_reference_string.hpp | 2 +- .../pippenger_reference_string.hpp | 8 +- 64 files changed, 725 insertions(+), 5724 deletions(-) delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp delete mode 100644 
cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp delete mode 100644 cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp delete mode 100644 cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp diff --git a/cpp/.clangd b/cpp/.clangd index 599f23163a..06f5d0d059 100644 --- a/cpp/.clangd +++ b/cpp/.clangd @@ -59,8 +59,6 @@ Diagnostics: - readability-function-cognitive-complexity # It is often nicer to not be explicit - google-explicit-constructor - CheckOptions: - - cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: True --- # this divider is necessary # Disable some checks for Google Test/Bench diff --git a/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp b/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp index 40f2b15284..72325f78b3 100644 --- a/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp +++ b/cpp/src/barretenberg/benchmark/pippenger_bench/main.cpp @@ -1,7 +1,8 @@ #include #include "barretenberg/common/assert.hpp" #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/bn254/bn254.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/srs/reference_string/file_reference_string.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -63,9 +64,9 @@ const auto init = []() { int pippenger() { - scalar_multiplication::pippenger_runtime_state state(NUM_POINTS); + scalar_multiplication::pippenger_runtime_state state(NUM_POINTS); std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - g1::element result = scalar_multiplication::pippenger_unsafe( + g1::element result = scalar_multiplication::pippenger_unsafe( &scalars[0], 
reference_string->get_monomial_points(), NUM_POINTS, state); std::chrono::steady_clock::time_point time_end = std::chrono::steady_clock::now(); std::chrono::microseconds diff = std::chrono::duration_cast(time_end - time_start); diff --git a/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp b/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp index 2010c592fb..a987a66d28 100644 --- a/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp +++ b/cpp/src/barretenberg/dsl/acir_proofs/acir_proofs.cpp @@ -72,7 +72,7 @@ size_t init_verification_key(void* pippenger, uint8_t const* g2x, uint8_t const* auto proving_key = std::make_shared(std::move(pk_data), crs); auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); proving_key->reference_string = crs_factory->get_prover_crs(proving_key->circuit_size); acir_format::Composer composer(proving_key, nullptr); @@ -108,7 +108,7 @@ size_t new_proof(void* pippenger, auto witness = from_buffer>(witness_buf); auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); proving_key->reference_string = crs_factory->get_prover_crs(proving_key->circuit_size); acir_format::Composer composer(proving_key, nullptr); diff --git a/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp b/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp index 55fae2d742..7302507e7e 100644 --- a/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp +++ b/cpp/src/barretenberg/ecc/curves/bn254/bn254.hpp @@ -2,13 +2,15 @@ #include "../bn254/fr.hpp" #include "../bn254/fq.hpp" #include "../bn254/g1.hpp" +#include "../bn254/g2.hpp" namespace curve { class BN254 { public: using ScalarField = barretenberg::fr; using BaseField = barretenberg::fq; - using ProjectiveElement = typename barretenberg::g1::element; + using Group = typename barretenberg::g1; + using Element = typename barretenberg::g1::element; using AffineElement = typename barretenberg::g1::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp index 66803cb5d5..74275eb514 100644 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp +++ b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/c_bind.cpp @@ -1,5 +1,6 @@ -#include "scalar_multiplication.hpp" -#include "pippenger.hpp" +#include "../bn254.hpp" +#include "../../scalar_multiplication/scalar_multiplication.hpp" +#include "../../scalar_multiplication/pippenger.hpp" #include "barretenberg/common/mem.hpp" using namespace barretenberg; @@ -21,19 +22,19 @@ WASM_EXPORT void bbfree(void* ptr) WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) { - auto ptr = new scalar_multiplication::Pippenger(points, num_points); + auto ptr = new scalar_multiplication::Pippenger(points, num_points); return ptr; } WASM_EXPORT void delete_pippenger(void* pippenger) { - delete reinterpret_cast(pippenger); + delete reinterpret_cast*>(pippenger); } WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* result_ptr) { - scalar_multiplication::pippenger_runtime_state state(range); - auto pippenger = reinterpret_cast(pippenger_ptr); + scalar_multiplication::pippenger_runtime_state state(range); + auto pippenger = reinterpret_cast*>(pippenger_ptr); auto scalars = reinterpret_cast(scalars_ptr); auto result = reinterpret_cast(result_ptr); *result = 
pippenger->pippenger_unsafe(scalars, from, range); diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp deleted file mode 100644 index cb8f93a6c3..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "pippenger.hpp" -#include "barretenberg/srs/io.hpp" -namespace barretenberg { -namespace scalar_multiplication { - -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) - : monomials_(points) - , num_points_(num_points) -{ - io::byteswap(&monomials_[0], num_points * 64); - scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(uint8_t const* points, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - barretenberg::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - barretenberg::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(std::string const& path, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - barretenberg::io::read_transcript_g1(monomials_, num_points, path); - barretenberg::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) -{ - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); -} - -Pippenger::~Pippenger() -{ - free(monomials_); -} - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp deleted file mode 100644 index 48a2c133f6..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once -#include "./scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace barretenberg { -namespace scalar_multiplication { - -inline size_t point_table_size(size_t num_points) -{ -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - - return 2 * num_points + prefetch_overflow; -} - -template inline size_t point_table_buf_size(size_t num_points) -{ - return sizeof(T) * point_table_size(num_points); -} - -template inline T* point_table_alloc(size_t num_points) -{ - return (T*)aligned_alloc(64, point_table_buf_size(num_points)); -} - -class Pippenger { - public: - /** - * Expects points to be buffer of size as per point_table_size(). - * It expects the crs to start at points[1], and it fills in affine_one at points[0]. - * The crs undergoes a byteswap, and then the point table is generated. 
- */ - Pippenger(g1::affine_element* points, size_t num_points); - - Pippenger(uint8_t const* points, size_t num_points); - - Pippenger(std::string const& path, size_t num_points); - - ~Pippenger(); - - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); - - g1::affine_element* get_point_table() const { return monomials_; } - - size_t get_num_points() const { return num_points_; } - - private: - g1::affine_element* monomials_; - size_t num_points_; -}; - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp deleted file mode 100644 index 01f92b8673..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "process_buckets.hpp" - -#include - -namespace barretenberg { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept -{ - constexpr size_t num_bits = 8; - constexpr size_t num_buckets = 1UL << num_bits; - constexpr uint32_t mask = static_cast(num_buckets) - 1U; - std::array bucket_counts{}; - - for (size_t i = 0; i < num_entries; ++i) { - bucket_counts[(keys[i] >> shift) & mask]++; - } - - std::array offsets; - std::array offsets_copy; - offsets[0] = 0; - - for (size_t i = 0; i < num_buckets - 1; ++i) { - bucket_counts[i + 1] += bucket_counts[i]; - } - for (size_t i = 1; i < num_buckets + 1; ++i) { - offsets[i] = bucket_counts[i - 1]; - } - for (size_t i = 0; i < num_buckets + 1; ++i) { - offsets_copy[i] = offsets[i]; - } - uint64_t* start = &keys[0]; - - for (size_t i = 0; i < num_buckets; ++i) { - uint64_t* bucket_start = &keys[offsets[i]]; - const uint64_t* bucket_end = &keys[offsets_copy[i + 1]]; - while (bucket_start != bucket_end) { - for (uint64_t* it = bucket_start; it < bucket_end; ++it) { - const size_t value = (*it >> shift) & mask; - const uint64_t offset = offsets[value]++; - std::iter_swap(it, start + offset); - } - bucket_start = &keys[offsets[i]]; - } - } - if (shift > 0) { - for (size_t i = 0; i < num_buckets; ++i) { - if (offsets_copy[i + 1] - offsets_copy[i] > 1) { - radix_sort(&keys[offsets_copy[i]], offsets_copy[i + 1] - offsets_copy[i], shift - 8); - } - } - } -} - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept -{ - const uint32_t bits_per_round = 8; - const uint32_t base = num_bits & 7; - const uint32_t total_bits = (base == 0) ? 
num_bits : num_bits - base + 8; - const uint32_t shift = total_bits - bits_per_round; - - radix_sort(wnaf_entries, num_entries, shift); -} -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp deleted file mode 100644 index bde5916663..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/process_buckets.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include - -namespace barretenberg { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp deleted file mode 100644 index 6e8aa5ccd8..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "runtime_states.hpp" - -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace barretenberg { -namespace scalar_multiplication { - -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) -{ - constexpr size_t MAX_NUM_ROUNDS = 256; - num_points = num_initial_points * 2; - const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); - const size_t num_buckets = static_cast( - 1U << barretenberg::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - const size_t num_rounds = - static_cast(barretenberg::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); - point_schedule = (uint64_t*)(aligned_alloc( - 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); - skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); - bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); - round_counts = (uint64_t*)(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))); - - const size_t points_per_thread = static_cast(num_points) / num_threads; -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - const size_t thread_offset = i * points_per_thread; - 
memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); - for (size_t j = 0; j < num_rounds; ++j) { - const size_t round_offset = (j * static_cast(num_points)); - memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); - } - memset((void*)(skew_table + thread_offset), 0, points_per_thread * sizeof(bool)); - } - - memset((void*)bucket_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bit_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bucket_empty_status, 0, num_threads * num_buckets * sizeof(bool)); - memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); -} - -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) -{ - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; -} - -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } - - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; - return *this; -} - -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) -{ - const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << barretenberg::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - - scalar_multiplication::affine_product_runtime_state product_state; - - product_state.point_pairs_1 = point_pairs_1 
+ (thread_index * points_per_thread) + (thread_index * 16); - product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); - product_state.scratch_space = scratch_space + (thread_index * (points_per_thread / 2)); - product_state.bucket_counts = bucket_counts + (thread_index * (num_buckets)); - product_state.bit_offsets = bit_counts + (thread_index * (num_buckets)); - product_state.bucket_empty_status = bucket_empty_status + (thread_index * (num_buckets)); - return product_state; -} - -pippenger_runtime_state::~pippenger_runtime_state() -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } -} -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp deleted file mode 100644 index 14c62eb089..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include "../g1.hpp" - -namespace barretenberg { -// simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. -// This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
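The pre-allocation strategy described above can be illustrated with a small standalone sketch (assumed names, using std::aligned_alloc and std::free directly rather than the wrappers in barretenberg/common/mem.hpp): zeroing a buffer immediately after allocating it forces the OS to map every page, so the multi-exponentiation hot loop is not interrupted by first-write page faults. This mirrors the memsets performed by the pippenger_runtime_state constructor.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative only: allocate a 64-byte-aligned buffer and touch every page up front.
static uint64_t* alloc_and_touch(size_t num_entries)
{
    // std::aligned_alloc requires the byte count to be a multiple of the alignment.
    const size_t bytes = ((num_entries * sizeof(uint64_t) + 63) / 64) * 64;
    auto* buffer = static_cast<uint64_t*>(std::aligned_alloc(64, bytes));
    if (buffer != nullptr) {
        std::memset(static_cast<void*>(buffer), 0, bytes); // fault the pages in now, not mid-algorithm
    }
    return buffer; // caller releases with std::free
}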
-namespace scalar_multiplication { -constexpr size_t get_optimal_bucket_width(const size_t num_points) -{ - if (num_points >= 14617149) { - return 21; - } - if (num_points >= 1139094) { - return 18; - } - // if (num_points >= 100000) - if (num_points >= 155975) { - return 15; - } - if (num_points >= 144834) - // if (num_points >= 100000) - { - return 14; - } - if (num_points >= 25067) { - return 12; - } - if (num_points >= 13926) { - return 11; - } - if (num_points >= 7659) { - return 10; - } - if (num_points >= 2436) { - return 9; - } - if (num_points >= 376) { - return 7; - } - if (num_points >= 231) { - return 6; - } - if (num_points >= 97) { - return 5; - } - if (num_points >= 35) { - return 4; - } - if (num_points >= 10) { - return 3; - } - if (num_points >= 2) { - return 2; - } - return 1; -} - -constexpr size_t get_num_rounds(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return WNAF_SIZE(bits_per_bucket + 1); -} - -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_offsets; - uint64_t* point_schedule; - uint32_t num_points; - uint32_t num_buckets; - bool* bucket_empty_status; -}; - -struct pippenger_runtime_state { - uint64_t* point_schedule; - bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_counts; - bool* bucket_empty_status; - uint64_t* round_counts; - uint64_t num_points; - - pippenger_runtime_state(const size_t num_initial_points); - pippenger_runtime_state(pippenger_runtime_state&& other); - pippenger_runtime_state& operator=(pippenger_runtime_state&& other); - ~pippenger_runtime_state(); - - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); -}; -} // namespace scalar_multiplication -} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp deleted file mode 100644 index 8ff1782f1e..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.cpp +++ /dev/null @@ -1,946 +0,0 @@ -#include "./scalar_multiplication.hpp" - -#include "barretenberg/common/throw_or_abort.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#include -#include -#include -#include - -#include "../../../groups/wnaf.hpp" -#include "../fq.hpp" -#include "../fr.hpp" -#include "../g1.hpp" -#include "./process_buckets.hpp" -#include "./runtime_states.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -#define BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 16] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 17] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 18] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 19] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 20] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 21] >> 32ULL)); \ - __builtin_prefetch(state.points + 
(state.point_schedule[schedule_it + 22] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 23] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 24] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 25] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 26] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 27] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 28] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 29] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 30] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 31] >> 32ULL)); \ - \ - uint64_t schedule_a = state.point_schedule[schedule_it]; \ - uint64_t schedule_b = state.point_schedule[schedule_it + 1]; \ - uint64_t schedule_c = state.point_schedule[schedule_it + 2]; \ - uint64_t schedule_d = state.point_schedule[schedule_it + 3]; \ - uint64_t schedule_e = state.point_schedule[schedule_it + 4]; \ - uint64_t schedule_f = state.point_schedule[schedule_it + 5]; \ - uint64_t schedule_g = state.point_schedule[schedule_it + 6]; \ - uint64_t schedule_h = state.point_schedule[schedule_it + 7]; \ - uint64_t schedule_i = state.point_schedule[schedule_it + 8]; \ - uint64_t schedule_j = state.point_schedule[schedule_it + 9]; \ - uint64_t schedule_k = state.point_schedule[schedule_it + 10]; \ - uint64_t schedule_l = state.point_schedule[schedule_it + 11]; \ - uint64_t schedule_m = state.point_schedule[schedule_it + 12]; \ - uint64_t schedule_n = state.point_schedule[schedule_it + 13]; \ - uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ - uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ - \ - g1::conditional_negate_affine( \ - state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 
1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ - \ - current_offset += 16; \ - schedule_it += 16; - -namespace barretenberg { -namespace scalar_multiplication { - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) -{ - // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); - for (size_t i = num_points - 1; i < num_points; --i) { - table[i * 2] = points[i]; - table[i * 2 + 1].x = beta * points[i].x; - table[i * 2 + 1].y = -points[i].y; - } -} - -/** - * Compute the windowed-non-adjacent-form versions of our scalar multipliers. - * - * We start by splitting our 254 bit scalars into 2 127-bit scalars, using the short weierstrass curve endomorphism - * (for a point P \in \G === (x, y) \in \Fq, then (\beta x, y) = (\lambda) * P , where \beta = 1^{1/3} mod Fq and - *\lambda = 1^{1/3} mod Fr) (which means we can represent a scalar multiplication (k * P) as (k1 * P + k2 * \lambda * - *P), where k1, k2 have 127 bits) (see field::split_into_endomorphism_scalars for more details) - * - * Once we have our 127-bit scalar multipliers, we determine the optimal number of pippenger rounds, given the number of - *points we're multiplying. Once we have the number of rounds, `m`, we need to split our scalar into `m` bit-slices. - *Each pippenger round will work on one bit-slice. - * - * Pippenger's algorithm works by, for each round, iterating over the points we're multplying. For each point, we - *examing the point's scalar multiplier and extract the bit-slice associated with the current pippenger round (we start - *with the most significant slice). We then use the bit-slice to index a 'bucket', which we add the point into. For - *example, if the bit slice is 01101, we add the corresponding point into bucket[13]. - * - * At the end of each pippenger round we concatenate the buckets together. E.g. if we have 8 buckets, we compute: - * sum = bucket[0] + 2 * bucket[1] + 3 * bucket[2] + 4 * bucket[3] + 5 * bucket[4] + 6 * bucket[5] + 7 * bucket[6] + 8 * - *bucket[7]. - * - * At the end of each pippenger round, the bucket sum will contain the scalar multiplication result for one bit slice. - * For example, say we have 16 rounds, where each bit slice contains 8 bits (8 * 16 = 128, enough to represent our 127 - *bit scalars). At the end of the first round, we will have taken the 8 most significant bits from every scalar - *multiplier. Our bucket sum will be the result of a mini-scalar-multiplication, where we have multiplied every point by - *the 8 most significant bits of each point's scalar multiplier. - * - * We repeat this process for every pippenger round. In our example, this gives us 16 bucket sums. 
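The bucket concatenation above (sum = bucket[0] + 2 * bucket[1] + ... + 8 * bucket[7]) only needs roughly two additions per bucket. Here is a toy sketch of that running-sum trick, with int64_t standing in for curve points purely for illustration and made-up names; a group-element variant of the same fold appears later in evaluate_pippenger_rounds.

#include <cstdint>
#include <vector>

// Walk the buckets from the top, keeping a running sum; adding the running sum into an
// accumulator after every step yields sum_k (k + 1) * bucket[k] in about 2n additions.
static int64_t fold_buckets(const std::vector<int64_t>& buckets)
{
    int64_t running_sum = 0; // bucket[n-1] + ... + bucket[k] after processing index k
    int64_t accumulator = 0; // accumulates the running sums
    for (size_t k = buckets.size(); k-- > 0;) {
        running_sum += buckets[k];
        accumulator += running_sum;
    }
    return accumulator;
}

int main()
{
    const std::vector<int64_t> buckets{ 5, 7, 11 }; // expect 1*5 + 2*7 + 3*11 = 52
    return fold_buckets(buckets) == 52 ? 0 : 1;
}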
- * We need to multiply the most significant bucket sum by 2^{120}, the second most significant bucket sum by 2^{112} - *etc. Once this is done we can add the bucket sums together, to evaluate our scalar multiplication result. - * - * Pippenger has complexity O(n / log n), because of two factors at play: the number of buckets we need to concatenate - *per round, and the number of points we need to add into buckets per round. - * - * To minimize the number of point additions per round, we want fewer rounds. But fewer rounds increases the number of - *bucket concatenations. The more points we have, the greater the time saving when reducing the number of rounds, which - *means we can afford to have more buckets per round. - * - * For a concrete example, with 2^20 points, the sweet spot is 2^15 buckets - with 2^15 buckets we can evaluate our 127 - *bit scalar multipliers in 8 rounds (we can represent b-bit windows with 2^{b-1} buckets, more on that below). - * - * This means that, for each round, we add 2^21 points into buckets (we've split our scalar multipliers into two - *half-width multipliers, so each round has twice the number of points. This is the reason why the endomorphism is - *useful here; without the endomorphism, we would need twice the number of buckets for each round). - * - * We also concatenate 2^15 buckets for each round. This requires 2^16 point additions. - * - * Meaning that the total number of point additions is (8 * 2^21) + (8 * 2^16) = 33 * 2^19 ~ 2^24 point additions. - * If we were to use a simple Montgomery double-and-add ladder to exponentiate each point, we would need 2^27 point - *additions (each scalar multiplier has ~2^7 non-zero bits, and there are 2^20 points). - * - * This makes Pippenger 8 times faster than the naive O(n) equivalent. Given that a circuit with 1 million gates will - *require 9 multiple-scalar-multiplications with 2^20 points, efficiently using Pippenger's algorithm is essential for - *fast provers. - * - * One additional efficiency gain is the use of 2^{b-1} buckets to represent b bits. To do this we represent our - *bit-slices in non-adjacent form. Non-adjacent form represents values using a base, where each 'bit' can take the - *values (-1, 0, 1). This is considerably more efficient than binary form for scalar multiplication, as inverting a - *point can be done by negating the y-coordinate. - * - * We actually use a slightly different representation than simple non-adjacent form. To represent b bits, a bit slice - *contains values from (-2^{b} + 1, ..., -1, 1, ..., 2^{b} - 1). i.e. we only have odd values. We do this to eliminate - *0-valued windows, as having a conditional branch in our hot loop to check if an entry is 0 is something we want to - *avoid. - * - * The above representation can be used to represent any binary number as long as we add a 'skew' factor. Each scalar - *multiplier's `skew` tracks if the scalar multiplier is even or odd. If it's even, `skew = true`, and we add `1` to our - *multiplier to make it odd. - * - * We then, at the end of the Pippenger algorithm, subtract a point from the total result, if that point's skew is - *`true`. - * - * At the end of `compute_wnaf_states`, `state.wnaf_table` will contain our wnaf entries, but unsorted. 
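A standalone sketch of the odd-digit window plus skew representation described above, using toy uint64_t scalars and made-up names (this is only the idea, not the wnaf::fixed_wnaf_with_counts routine): even scalars get skew = true and are bumped to the next odd value, and the odd scalar is then split into w-bit digits that are always odd, choosing whichever signed representative keeps the remaining scalar odd.

#include <cassert>
#include <cstdint>
#include <vector>

struct OddWindowForm {
    bool skew;                   // true if the original scalar was even (subtract the point at the end)
    std::vector<int64_t> digits; // odd digits, |digit| <= 2^w - 1, least significant first
};

static OddWindowForm to_odd_windows(uint64_t scalar, uint32_t w)
{
    OddWindowForm out{ (scalar & 1) == 0, {} };
    uint64_t k = scalar + (out.skew ? 1 : 0); // working scalar is always odd
    const uint64_t window_mask = (uint64_t(1) << w) - 1;
    while (k != 0) {
        const uint64_t low = k & window_mask;    // odd, in [1, 2^w - 1]
        const uint64_t quotient = (k - low) >> w;
        if (quotient == 0 || (quotient & 1) == 1) {
            out.digits.push_back(int64_t(low));  // positive odd representative
            k = quotient;
        } else {
            out.digits.push_back(int64_t(low) - (int64_t(1) << w)); // negative odd representative
            k = quotient + 1;                    // borrow keeps the remaining scalar odd
        }
    }
    return out;
}

int main()
{
    const uint64_t scalar = 123456788; // even, so the skew flag gets set
    const uint32_t w = 4;
    const OddWindowForm form = to_odd_windows(scalar, w);
    // Reconstruct: sum_i digit_i * 2^{w*i} must equal scalar + skew.
    int64_t acc = 0;
    for (size_t i = form.digits.size(); i-- > 0;) {
        acc = (acc << w) + form.digits[i];
    }
    assert(uint64_t(acc) == scalar + (form.skew ? 1 : 0));
    return 0;
}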
- * - * @param point_schedule Pointer to the output array with all WNAFs - * @param input_skew_table Pointer to the output array with all skews - * @param round_counts The number of points in each round - * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF - * @param num_initial_points The number of points before the endomorphism split - **/ -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points) -{ - const size_t num_points = num_initial_points * 2; - constexpr size_t MAX_NUM_ROUNDS = 256; - constexpr size_t MAX_NUM_THREADS = 128; - const size_t num_rounds = get_num_rounds(num_points); - const size_t bits_per_bucket = get_optimal_bucket_width(num_initial_points); - const size_t wnaf_bits = bits_per_bucket + 1; -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t num_initial_points_per_thread = num_initial_points / num_threads; - const size_t num_points_per_thread = num_points / num_threads; - std::array, MAX_NUM_THREADS> thread_round_counts; - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - thread_round_counts[i][j] = 0; - } - } -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - fr T0; - uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; - bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; - uint64_t offset = i * num_points_per_thread; - - for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { - T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); - } - } - - for (size_t i = 0; i < num_rounds; ++i) { - round_counts[i] = 0; - } - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - round_counts[j] += thread_round_counts[i][j]; - } - } -} - -/** - * Sorts our wnaf entries in increasing bucket order (per round). - * We currently don't multi-thread the inner sorting algorithm, and just split our threads over the number of rounds. - * A multi-threaded sorting algorithm could be more efficient, but the total runtime of `organize_buckets` is <5% of - * pippenger's runtime, so not a priority. - **/ -void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t num_points) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_rounds; ++i) { - scalar_multiplication::process_buckets(&point_schedule[i * num_points], - num_points, - static_cast(get_optimal_bucket_width(num_points / 2)) + 1); - } -} - -/** - * adds a bunch of points together using affine addition formulae. - * Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to perform. 
- * Affine formula: - * - * \lambda = (y_2 - y_1) / (x_2 - x_1) - * x_3 = \lambda^2 - (x_2 + x_1) - * y_3 = \lambda*(x_1 - x_3) - y_1 - * - * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse, - * which is outrageously expensive. - * - * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0. - * - * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \} - * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products. - * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n] - * We then compute a single inverse: I = 1 / x_1x_2...x_n - * Finally, we can use our accumulated products, to quotient out individual inverses. - * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1) - * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we - * require. - * - * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse. - * Which means we can compute a point addition with SIX field multiplications in total. - * The traditional Jacobian-coordinate formula requires 11. - * - * There is a catch though - we need large sequences of independent point additions! - * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the sequence. - * - * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated - **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - - if (batch_inversion_accumulator == 0) { - throw_or_abort("attempted to invert zero in add_affine_points"); - } else { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. 
- // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { - continue; - } - if (points[i].x == points[i + 1].x) { - if (points[i].y == points[i + 1].y) { - // double - scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); - points[i + 1].x = points[i].y + points[i].y; // 2y - points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 - points[i + 1].y *= batch_inversion_accumulator; - batch_inversion_accumulator *= (points[i + 1].x); - continue; - } - points[i].self_set_infinity(); - points[i + 1].self_set_infinity(); - continue; - } - - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - if (!batch_inversion_accumulator.is_zero()) { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. - // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - if (points[i].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i + 1]; - continue; - } - if (points[i + 1].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i]; - continue; - } - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -/** - * evaluate a chain of pairwise additions. - * The additions are sequenced into base-2 segments - * i.e. 
pairs, pairs of pairs, pairs of pairs of pairs etc - * `max_bucket_bits` indicates the largest set of nested pairs in the array, - * which defines the iteration depth - **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) -{ - size_t end = state.num_points; - size_t start = 0; - for (size_t i = 0; i < max_bucket_bits; ++i) { - const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); - start = end - points_in_round; - if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } - } -} - -/** - * This is the entry point for our 'find a way of evaluating a giant multi-product using affine coordinates' algorithm - * By this point, we have already sorted our pippenger buckets. So we have the following situation: - * - * 1. We have a defined number of buckets points - * 2. We have a defined number of points, that need to be added into these bucket points - * 3. number of points >> number of buckets - * - * The algorithm begins by counting the number of points assigned to each bucket. - * For each bucket, we then take this count and split it into its base-2 components. - * e.g. if bucket[3] has 14 points, we split that into a sequence of (8, 4, 2) - * This base-2 splitting is useful, because we can take the bucket's associated points, and - * sort them into pairs, quads, octs etc. These mini-addition sequences are independent from one another, - * which means that we can use the affine trick to evaluate them. - * Once we're done, we have effectively reduced the number of points in the bucket to a logarithmic factor of the input. - * e.g. in the above example, once we've evaluated our pairwise addition of 8, 4 and 2 elements, - * we're left with 3 points. - * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. - * We repeat this process until every bucket only has one point assigned to it. - **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) -{ - - // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - // This method sorts our points into our required base-2 sequences. - // `max_bucket_bits` is log2(maximum bucket count). - // This sets the upper limit on how many iterations we need to perform in `evaluate_addition_chains`. - // e.g. if `max_bucket_bits == 3`, then we have at least one bucket with >= 8 points in it. - // which means we need to repeat our pairwise addition algorithm 3 times - // (e.g. add 4 pairs together to get 2 pairs, add those pairs together to get a single pair, which we add to reduce - // to our final point) - const size_t max_bucket_bits = construct_addition_chains(state, first_round); - - // if max_bucket_bits is 0, we're done! we can return - if (max_bucket_bits == 0) { - return state.point_pairs_1; - } - - // compute our required additions using the affine trick - evaluate_addition_chains(state, max_bucket_bits, handle_edge_cases); - - // this next step is a processing step, that computes a new point schedule for our reduced points. - // In the pippenger algorithm, we use a 64-bit uint to categorize each point. - // The high 32 bits describes the position of the point in a point array. 
- // The low 31 bits describes the bucket index that the point maps to - // The 32nd bit defines whether the point is actually a negation of our stored point. - - // We want to compute these 'point schedule' uints for our reduced points, so that we can recurse back into - // `reduce_buckets` - uint32_t start = 0; - const uint32_t end = static_cast(state.num_points); - // The output of `evaluate_addition_chains` has a bit of an odd structure, should probably refactor. - // Effectively, we used to have one big 1d array, and the act of computing these pair-wise point additions - // has chopped it up into sequences of smaller 1d arrays, with gaps in between - for (size_t i = 0; i < max_bucket_bits; ++i) { - const uint32_t points_in_round = - (static_cast(state.num_points) - state.bit_offsets[i + 1]) >> static_cast(i); - const uint32_t points_removed = points_in_round / 2; - - start = end - points_in_round; - const uint32_t modified_start = start + points_removed; - state.bit_offsets[i + 1] = modified_start; - } - - // iterate over each bucket. Identify how many remaining points there are, and compute their point scheduels - uint32_t new_num_points = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t& count = state.bucket_counts[i]; - uint32_t num_bits = numeric::get_msb(count) + 1; - uint32_t new_bucket_count = 0; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = state.bit_offsets[j]; - const bool has_entry = ((count >> j) & 1) == 1; - if (has_entry) { - uint64_t schedule = (static_cast(current_offset) << 32ULL) + i; - state.point_schedule[new_num_points++] = schedule; - ++new_bucket_count; - ++current_offset; - } - } - count = new_bucket_count; - } - - // modify `num_points` to reflect the new number of reduced points. - // also swap around the `point_pairs` pointer; what used to be our temporary array - // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; - state.num_points = new_num_points; - state.points = state.point_pairs_1; - state.point_pairs_1 = state.point_pairs_2; - state.point_pairs_2 = temp; - - // We could probably speed this up by unroling the recursion. - // But each extra call to `reduce_buckets` has an input size that is ~log(previous input size) - // so the extra run-time is meh - return reduce_buckets(state, false, handle_edge_cases); -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) -{ - // if this is the first call to `construct_addition_chains`, we need to count up our buckets - if (empty_bucket_counts) { - memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); - const uint32_t first_bucket = static_cast(state.point_schedule[0] & 0x7fffffffUL); - for (size_t i = 0; i < state.num_points; ++i) { - size_t bucket_index = static_cast(state.point_schedule[i] & 0x7fffffffUL); - ++state.bucket_counts[bucket_index - first_bucket]; - } - for (size_t i = 0; i < state.num_buckets; ++i) { - state.bucket_empty_status[i] = (state.bucket_counts[i] == 0); - } - } - - uint32_t max_count = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - max_count = state.bucket_counts[i] > max_count ? state.bucket_counts[i] : max_count; - } - - const uint32_t max_bucket_bits = numeric::get_msb(max_count); - - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - state.bit_offsets[i] = 0; - } - - // theoretically, can be unrolled using templated methods. - // However, explicitly unrolling the loop by using recursive template calls was slower! 
- // Inner loop is currently bounded by a constexpr variable, need to see what the compiler does with that... - count_bits(state.bucket_counts, &state.bit_offsets[0], state.num_buckets, max_bucket_bits); - - // we need to update `bit_offsets` to compute our point shuffle, - // but we need the original array later on, so make a copy. - std::array bit_offsets_copy = { 0 }; - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - bit_offsets_copy[i] = state.bit_offsets[i]; - } - - // this is where we take each bucket's associated points, and arrange them - // in a pairwise order, so that we can compute large sequences of additions using the affine trick - size_t schedule_it = 0; - uint32_t* bucket_count_it = state.bucket_counts; - - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t count = *bucket_count_it; - ++bucket_count_it; - uint32_t num_bits = numeric::get_msb(count) + 1; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = bit_offsets_copy[j]; - const size_t k_end = count & (1UL << j); - // This section is a bottleneck - to populate our point array, we need - // to read from memory locations that are effectively uniformly randomly distributed! - // (assuming our scalar multipliers are uniformly random...) - // In the absence of a more elegant solution, we use ugly macro hacks to try and - // unroll loops, and prefetch memory a few cycles before we need it - switch (k_end) { - case 64: { - [[fallthrough]]; - } - case 32: { - [[fallthrough]]; - } - case 16: { - for (size_t k = 0; k < (k_end >> 4); ++k) { - BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK; - } - break; - } - case 8: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 8] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 9] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 10] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 11] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 12] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 13] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 14] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 15] >> 32ULL)); - - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - const uint64_t schedule_e = state.point_schedule[schedule_it + 4]; - const uint64_t schedule_f = state.point_schedule[schedule_it + 5]; - const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; - const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - 
state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); - - current_offset += 8; - schedule_it += 8; - break; - } - case 4: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - current_offset += 4; - schedule_it += 4; - break; - } - case 2: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - current_offset += 2; - schedule_it += 2; - break; - } - case 1: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - ++current_offset; - ++schedule_it; - break; - } - case 0: { - break; - } - default: { - for (size_t k = 0; k < k_end; ++k) { - uint64_t schedule = state.point_schedule[schedule_it]; - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 1] >> 32ULL)); - - const uint64_t 
predicate = (schedule >> 31UL) & 1UL; - - g1::conditional_negate_affine( - state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); - ++current_offset; - ++schedule_it; - } - } - } - } - } - return max_bucket_bits; -} - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); - -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t j = 0; j < num_threads; ++j) { - thread_accumulators[j].self_set_infinity(); - - for (size_t i = 0; i < num_rounds; ++i) { - - const uint64_t num_round_points = state.round_counts[i]; - - g1::element accumulator; - accumulator.self_set_infinity(); - - if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { - } else { - - const uint64_t num_round_points_per_thread = num_round_points / num_threads; - const uint64_t leftovers = - (j == num_threads - 1) ? (num_round_points) - (num_round_points_per_thread * num_threads) : 0; - - uint64_t* thread_point_schedule = - &state.point_schedule[(i * num_points) + j * num_round_points_per_thread]; - const size_t first_bucket = thread_point_schedule[0] & 0x7fffffffU; - const size_t last_bucket = - thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; - const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); - product_state.num_points = static_cast(num_round_points_per_thread + leftovers); - product_state.points = points; - product_state.point_schedule = thread_point_schedule; - product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; - running_sum.self_set_infinity(); - - // one nice side-effect of the affine trick, is that half of the bucket concatenation - // algorithm can use mixed addition formulae, instead of full addition formulae - size_t output_it = product_state.num_points - 1; - for (size_t k = num_thread_buckets - 1; k > 0; --k) { - if (__builtin_expect(!product_state.bucket_empty_status[k], 1)) { - running_sum += (output_buckets[output_it]); - --output_it; - } - accumulator += running_sum; - } - running_sum += output_buckets[0]; - accumulator.self_dbl(); - accumulator += running_sum; - - // we now need to scale up 'running sum' up to the value of the first bucket. - // e.g. 
if first bucket is 0, no scaling - // if first bucket is 1, we need to add (2 * running_sum) - if (first_bucket > 0) { - uint32_t multiplier = static_cast(first_bucket << 1UL); - size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; - bool init = false; - while (shift != static_cast(-1)) { - if (init) { - rolling_accumulator.self_dbl(); - if (((multiplier >> shift) & 1)) { - rolling_accumulator += running_sum; - } - } else { - rolling_accumulator += running_sum; - } - init = true; - shift -= 1; - } - accumulator += rolling_accumulator; - } - } - - if (i == (num_rounds - 1)) { - const size_t num_points_per_thread = num_points / num_threads; - bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; - for (size_t k = 0; k < num_points_per_thread; ++k) { - if (skew_table[k]) { - addition_temporary = -point_table[k]; - accumulator += addition_temporary; - } - } - } - - if (i > 0) { - for (size_t k = 0; k < bits_per_bucket + 1; ++k) { - thread_accumulators[j].self_dbl(); - } - } - thread_accumulators[j] += accumulator; - } - } - - g1::element result; - result.self_set_infinity(); - for (size_t i = 0; i < num_threads; ++i) { - result += thread_accumulators[i]; - } - return result; -} - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); - return result; -} - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // our windowed non-adjacent form algorithm requires that each thread can work on at least 8 points. - // If we fall below this threshold, fall back to the traditional scalar multiplication algorithm. - // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger -#ifndef NO_MULTITHREADING - const size_t threshold = std::max(max_threads::compute_num_threads() * 8, 8UL); -#else - const size_t threshold = 8UL; -#endif - - if (num_initial_points == 0) { - g1::element out = g1::one; - out.self_set_infinity(); - return out; - } - - if (num_initial_points <= threshold) { - std::vector<g1::element> exponentiation_results(num_initial_points); - // might as well multithread this... - // Possible optimization: use group::batch_mul_with_endomorphism here.
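        // For example, assuming 8 threads, the threshold above works out to std::max(8 * 8, 8) = 64, so
        // multi-exponentiations of 64 or fewer points take this naive per-point loop instead of the full
        // Pippenger machinery.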
-#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; - } - - for (size_t i = num_initial_points - 1; i > 0; --i) { - exponentiation_results[i - 1] += exponentiation_results[i]; - } - return exponentiation_results[0]; - } - - const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); - const size_t num_slice_points = static_cast(1ULL << slice_bits); - - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); - - if (num_slice_points != num_initial_points) { - const uint64_t leftover_points = num_initial_points - num_slice_points; - return result + pippenger(scalars + num_slice_points, - points + static_cast(num_slice_points * 2), - static_cast(leftover_points), - state, - handle_edge_cases); - } else { - return result; - } -} - -/** - * It's pippenger! But this one has go-faster stripes and a predilection for questionable life choices. - * We use affine-addition formula in this method, which paradoxically is ~45% faster than the mixed addition formulae. - * See `scalar_multiplication.cpp` for a more detailed description. - * - * It's...unsafe, because we assume that the incomplete addition formula exceptions are not triggered. - * We don't bother to check for this to avoid conditional branches in a critical section of our code. - * This is fine for situations where your bases are linearly independent (i.e. KZG10 polynomial commitments), - * because triggering the incomplete addition exceptions is about as hard as solving the discrete log problem. - * - * This is ok for the prover, but GIANT RED CLAXON WARNINGS FOR THE VERIFIER - * Don't use this in a verification algorithm! That would be a really bad idea. - * Unless you're a malicious adversary, then it would be a great idea! - * - **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - return pippenger(scalars, points, num_initial_points, state, false); -} -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - std::vector<g1::affine_element> G_mod(num_initial_points * 2); - barretenberg::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); - return pippenger(scalars, &G_mod[0], num_initial_points, state, false); -} -} // namespace scalar_multiplication -} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp deleted file mode 100644 index 36613a47a9..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#pragma once - -#include "../fr.hpp" -#include "../g1.hpp" -#include "./runtime_states.hpp" -#include -#include - -namespace barretenberg { -namespace scalar_multiplication { - -constexpr size_t get_num_buckets(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return 1UL << bits_per_bucket; -} - -/** - * pointers that describe how to add points into buckets, for the pippenger algorithm.
- * `wnaf_table` is an unrolled two-dimensional array, with each inner array being of size `n`, - * where `n` is the number of points being multiplied. The second dimension size is defined by - * the number of pippenger rounds (fixed for a given `n`, see `get_num_rounds`) - * - * An entry of `wnaf_table` contains the following three pieces of information: - * 1: the point index that we're working on. This is stored in the high 32 bits - * 2: the bucket index that we're adding the point into. This is stored in the low 31 bits - * 3: the sign of the point we're adding (i.e. do we actually need to subtract). This is stored in the 32nd bit. - * - * We pack this information into a 64 bit unsigned integer, so that we can more efficiently sort our wnaf entries. - * For a given round, we want to sort our wnaf entries in increasing bucket index order. - * - * This is so that we can efficiently use multiple threads to execute the pippenger algorithm. - * For a given round, a given point's bucket index will be uniformly randomly distributed, - * assuming the inputs are from a zero-knowledge proof. This is because the scalar multiplier will be uniformly randomly - *distributed, and the bucket indices are derived from the scalar multiplier. - * - * This means that, if we were to iterate over all of our points in order, and add each point into its associated - *bucket, we would be accessing all of our buckets in a completely random pattern. - * - * Aside from memory latency problems this incurs, this makes the naive algorithm unsuitable for multithreading - we - *cannot assign a thread a tranche of points, because each thread will be adding points into the same set of buckets, - *triggering race conditions. We do not want to manage the overhead of thread locks for each bucket; the process of - *adding a point into a bucket takes, on average, only 400 CPU cycles, so the slowdown of managing mutex locks would add - *considerable overhead. - * - * The solution is to sort the buckets. If the buckets are sorted, we can assign a tranche of buckets to individual - *threads, safe in the knowledge that there will be no race conditions, with one condition. A thread's starting bucket - *may be equal to the previous thread's end bucket, so we need to ensure that each thread works on a local array of - *buckets. This adds little overhead (for 2^20 points, we have 32,768 buckets. With 8 threads, the amount of bucket - *overlap is ~16 buckets, so we could incur 16 extra 'additions' in pippenger's bucket concatenation phase, but this is - *an insignificant contribution). - * - * The alternative approach (the one we used to use) is to slice up all of the points being multiplied amongst all - *available threads, and run the complete pippenger algorithm for each thread. This is suboptimal, because the - *complexity of pippenger is O(n / logn) point additions, and a sequence of smaller pippenger calls will have a smaller - *`n`. - * - * This is the motivation for multi-threading the actual Pippenger algorithm. In addition, the above approach performs - *extremely poorly for GPUs, where the number of threads can be as high as 2^10 (for a multi-scalar-multiplication of - *2^20 points, this doubles the number of pippenger rounds per thread) - * - * To give concrete numbers, the difference between calling pippenger on 2^20 points, and calling pippenger 8 times on - *2^17 points, is 5-10%. Which means that, for 8 threads, we need to ensure that our sorting algorithm adds less than 5% - *to the total runtime of pippenger. 
Given a single cache miss per point would increase the run-time by 25%, this is not - *much room to work with! - * - * However, a radix sort, combined with the fact that the total number of buckets is quite small (2^16 at most), seems - *to be fast enough. Benchmarks indicate (i7-8650U, 8 threads) that, for 2^20 points, the total runtime is <1200ms and - *of that, the radix sort consumes 58ms (4.8%) - * - * One advantage of sorting by bucket order vs point order, is that a 'bucket' is 96 bytes large (sizeof(g1::element), - *buckets have z-coordinates). Points, on the other hand, are 64 bytes large (affine points, no z-coordinate). This - *makes fetching random point locations in memory more efficient than fetching random bucket locations, as each point - *occupies a single cache line. Using __builtin_prefetch to recover the point just before it's needed, seems to improve - *the runtime of pippenger by 10-20%. - * - * Finally, `skew_table` tracks whether a scalar multplier is even or odd - * (if it's even, we need to subtract the point from the total result, - * because our windowed non-adjacent form values can only be odd) - * - **/ - -struct multiplication_thread_state { - g1::element* buckets; - const uint64_t* point_schedule; -}; - -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points); - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); - -void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); - -inline void count_bits(uint32_t* bucket_counts, - uint32_t* bit_offsets, - const uint32_t num_buckets, - const size_t num_bits) -{ - for (size_t i = 0; i < num_buckets; ++i) { - const uint32_t count = bucket_counts[i]; - for (uint32_t j = 0; j < num_bits; ++j) { - bit_offsets[j + 1] += (count & (1U << j)); - } - } - bit_offsets[0] = 0; - for (size_t i = 2; i < num_bits + 1; ++i) { - bit_offsets[i] += bit_offsets[i - 1]; - } -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); - -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); - -void evaluate_addition_chains(affine_product_runtime_state& state, - const size_t max_bucket_bits, - bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); - -} // namespace scalar_multiplication -} // namespace barretenberg diff --git 
a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp deleted file mode 100644 index bd0875ba96..0000000000 --- a/cpp/src/barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.test.cpp +++ /dev/null @@ -1,937 +0,0 @@ -#include "pippenger.hpp" -#include "scalar_multiplication.hpp" -#include -#include "barretenberg/common/test.hpp" -#include "barretenberg/srs/io.hpp" -#include - -#include "barretenberg/numeric/random/engine.hpp" - -#include "barretenberg/common/mem.hpp" - -#define BARRETENBERG_SRS_PATH "../srs_db/ignition" - -using namespace barretenberg; -using namespace barretenberg::scalar_multiplication; - -namespace { -auto& engine = numeric::random::get_debug_engine(); -} - -TEST(scalar_multiplication, reduce_buckets_simple) -{ - constexpr size_t num_points = 128; - g2::affine_element g2_x; - io::read_transcript_g2(g2_x, BARRETENBERG_SRS_PATH); - auto pippenger = Pippenger(BARRETENBERG_SRS_PATH, num_points / 2); - auto monomials = pippenger.get_point_table(); - - std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); - std::array bucket_empty_status; - // 16 buckets, each bucket has one point - std::array transcript; - std::array transcript_points; - transcript_points[0] = 0x0; - transcript_points[1] = 0x2; - transcript_points[2] = 0x4; - transcript_points[3] = 0x6; - transcript_points[4] = 0xb; - transcript_points[5] = 0xc; - transcript_points[6] = 0xe; - transcript_points[7] = 0x11; - transcript_points[8] = 0x13; - transcript_points[9] = 0x14; - transcript_points[10] = 0x15; - transcript_points[11] = 0x16; - transcript_points[12] = 0x17; - transcript_points[13] = 0x18; - transcript_points[14] = 0x20; - transcript_points[15] = 0x21; - transcript_points[16] = 0x22; - transcript_points[17] = 0x27; - transcript_points[18] = 0x29; - transcript_points[19] = 0x2b; - transcript_points[20] = 0x2c; - transcript_points[21] = 0x2d; - transcript_points[22] = 0x2e; - transcript_points[23] = 0x36; - transcript_points[24] = 0x37; - transcript_points[25] = 0x38; - transcript_points[26] = 0x3e; - transcript_points[27] = 0x3f; - transcript_points[28] = 0x4e; - transcript_points[29] = 0x4f; - transcript_points[30] = 0x50; - transcript_points[31] = 0x51; - transcript_points[32] = 0x41; - transcript_points[33] = 0x52; - transcript_points[34] = 0x53; - transcript_points[35] = 0x54; - transcript_points[36] = 0x43; - transcript_points[37] = 0x57; - transcript_points[38] = 0x46; - transcript_points[39] = 0x58; - transcript_points[40] = 0x5b; - transcript_points[41] = 0x5e; - transcript_points[42] = 0x42; - transcript_points[43] = 0x47; - transcript_points[44] = 0x4b; - transcript_points[45] = 0x4d; - transcript_points[46] = 0x6b; - transcript_points[47] = 0x65; - transcript_points[48] = 0x6d; - transcript_points[49] = 0x67; - transcript_points[50] = 0x6f; - transcript_points[51] = 0x68; - transcript_points[52] = 0x69; - transcript_points[53] = 0x6a; - transcript_points[54] = 0x71; - transcript_points[55] = 0x72; - transcript_points[56] = 0x73; - transcript_points[57] = 0x74; - transcript_points[58] = 0x75; - transcript_points[59] = 0x66; - transcript_points[60] = 0x79; - transcript_points[62] = 0x7c; - transcript_points[61] = 0x7e; - transcript_points[63] = 0x7f; - transcript_points[64] = 0x1; - transcript_points[65] = 0x3; - transcript_points[66] = 0x5; - transcript_points[67] = 0x7; - transcript_points[68] = 0x8; - 
transcript_points[69] = 0x9; - transcript_points[70] = 0xa; - transcript_points[71] = 0xd; - transcript_points[72] = 0xf; - transcript_points[73] = 0x10; - transcript_points[74] = 0x12; - transcript_points[75] = 0x19; - transcript_points[76] = 0x1a; - transcript_points[77] = 0x1b; - transcript_points[78] = 0x1c; - transcript_points[79] = 0x1d; - transcript_points[80] = 0x1e; - transcript_points[81] = 0x1f; - transcript_points[82] = 0x23; - transcript_points[83] = 0x24; - transcript_points[84] = 0x25; - transcript_points[85] = 0x26; - transcript_points[86] = 0x28; - transcript_points[87] = 0x2a; - transcript_points[88] = 0x2f; - transcript_points[89] = 0x30; - transcript_points[90] = 0x31; - transcript_points[91] = 0x32; - transcript_points[92] = 0x33; - transcript_points[93] = 0x34; - transcript_points[94] = 0x35; - transcript_points[95] = 0x39; - transcript_points[96] = 0x3a; - transcript_points[97] = 0x3b; - transcript_points[98] = 0x3c; - transcript_points[99] = 0x3d; - transcript_points[100] = 0x48; - transcript_points[101] = 0x49; - transcript_points[102] = 0x55; - transcript_points[103] = 0x56; - transcript_points[104] = 0x4a; - transcript_points[105] = 0x44; - transcript_points[106] = 0x45; - transcript_points[107] = 0x40; - transcript_points[108] = 0x59; - transcript_points[109] = 0x5a; - transcript_points[110] = 0x5c; - transcript_points[111] = 0x5d; - transcript_points[112] = 0x5f; - transcript_points[113] = 0x60; - transcript_points[114] = 0x61; - transcript_points[115] = 0x62; - transcript_points[116] = 0x63; - transcript_points[117] = 0x4c; - transcript_points[118] = 0x6c; - transcript_points[119] = 0x6e; - transcript_points[120] = 0x64; - transcript_points[121] = 0x70; - transcript_points[122] = 0x77; - transcript_points[123] = 0x78; - transcript_points[124] = 0x76; - transcript_points[125] = 0x7a; - transcript_points[126] = 0x7b; - transcript_points[127] = 0x7d; - - for (size_t i = 0; i < 64; ++i) { - transcript[i] = 0; - transcript[i + 64] = 1; - } - for (size_t i = 0; i < num_points; ++i) { - point_schedule[i] = (static_cast(transcript_points[i]) << 32ULL) + transcript[i]; - } - std::array expected; - for (size_t i = 0; i < num_points; ++i) { - expected[i].self_set_infinity(); - } - - for (size_t i = 0; i < num_points; ++i) { - size_t schedule = transcript[i] & 0x7fffffffU; - { - expected[schedule] += monomials[static_cast(transcript_points[i])]; - } - } - - std::array point_pairs; - std::array output_buckets; - std::array scratch_space; - std::array bucket_counts; - std::array bit_offsets = { 0 }; - - scalar_multiplication::affine_product_runtime_state product_state{ - &monomials[0], &point_pairs[0], &output_buckets[0], - &scratch_space[0], &bucket_counts[0], &bit_offsets[0], - &point_schedule[0], num_points, 2, - &bucket_empty_status[0] - }; - - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); - - for (size_t i = 0; i < product_state.num_buckets; ++i) { - expected[i] = expected[i].normalize(); - EXPECT_EQ((output[i].x == expected[i].x), true); - EXPECT_EQ((output[i].y == expected[i].y), true); - } -} - -TEST(scalar_multiplication, reduce_buckets) -{ - constexpr size_t num_initial_points = 1 << 12; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* 
point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::element* expected_buckets = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points * 2))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points * 2))); - - memset((void*)scratch_points, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)expected_buckets, 0x00, (num_points * 2) * sizeof(g1::element)); - memset((void*)bucket_empty_status, 0x00, (num_points * 2) * sizeof(bool)); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - for (size_t i = 0; i < num_initial_points; ++i) { - scalars[i] = fr::random_element(); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * 100 * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - - uint64_t* point_schedule_copy = static_cast(aligned_alloc(64, sizeof(uint64_t) * num_points * 2)); - for (size_t i = 0; i < num_points; ++i) { - state.point_schedule[i + num_points] = state.point_schedule[i + num_points] & 0xffffffff7fffffffUL; - // printf("state.point_schedule[%lu] = %lx \n", i, state.point_schedule[i]); - point_schedule_copy[i] = state.point_schedule[i + num_points]; - } - const size_t first_bucket = point_schedule_copy[0] & 0x7fffffffULL; - const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - &state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - for (size_t i = 0; i < num_points; ++i) { - expected_buckets[i].self_set_infinity(); - 
} - for (size_t i = 0; i < num_points; ++i) { - uint64_t schedule = point_schedule_copy[i]; - uint64_t bucket_index = schedule & 0x7fffffffU; - uint64_t point_index = schedule >> 32ULL; - uint64_t predicate = (schedule >> 31ULL) & 1ULL; - // printf("expected bucket index = %lu \n", bucket_index - first_bucket); - g1::element& bucket = expected_buckets[bucket_index - first_bucket]; - g1::affine_element& point = monomials[point_index]; - bucket.self_mixed_add_or_sub(point, predicate); - } - - size_t it = 0; - - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); - - printf("num buckets = %zu \n", num_buckets); - for (size_t i = 0; i < num_buckets; ++i) { - if (!bucket_empty_status[i]) { - g1::element expected = expected_buckets[i].normalize(); - EXPECT_EQ((expected.x == result_buckets[it].x), true); - EXPECT_EQ((expected.y == result_buckets[it].y), true); - ++it; - } else { - printf("recorded empty bucket???\n"); - } - } - aligned_free(bucket_empty_status); - aligned_free(expected_buckets); - aligned_free(point_schedule_copy); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -// This test intermittenly fails. -TEST(scalar_multiplication, DISABLED_reduce_buckets_basic) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points))); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_points, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << 
diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(scalar_multiplication, add_affine_points) -{ - constexpr size_t num_points = 20; - g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - fq* scratch_space = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - fq* lambda = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - - g1::element* points_copy = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points))); - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - points_copy[i].x = points[i].x; - points_copy[i].y = points[i].y; - points_copy[i].z = fq::one(); - } - - size_t count = num_points - 1; - for (size_t i = num_points - 2; i < num_points; i -= 2) { - points_copy[count--] = points_copy[i] + points_copy[i + 1]; - points_copy[count + 1] = points_copy[count + 1].normalize(); - } - - scalar_multiplication::add_affine_points(points, num_points, scratch_space); - for (size_t i = num_points - 1; i > num_points - 1 - (num_points / 2); --i) { - EXPECT_EQ((points[i].x == points_copy[i].x), true); - EXPECT_EQ((points[i].y == points_copy[i].y), true); - } - aligned_free(lambda); - aligned_free(points); - aligned_free(points_copy); - aligned_free(scratch_space); -} - -TEST(scalar_multiplication, construct_addition_chains) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = 
std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - bool* bucket_empty_status = static_cast(aligned_alloc(64, num_points * sizeof(bool))); - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - static_cast(state.round_counts[0]), - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - info("construct addition chains: ", diff.count(), "ms"); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(scalar_multiplication, endomorphism_split) -{ - fr scalar = fr::random_element(); - - g1::element expected = g1::one * scalar; - - // we want to test that we can split a scalar into two half-length components, using the same location in memory. - fr* k1_t = &scalar; - fr* k2_t = (fr*)&scalar.data[2]; - - fr::split_into_endomorphism_scalars(scalar, *k1_t, *k2_t); - // The compiler really doesn't like what we're doing here, - // and disabling the array-bounds error project-wide seems unsafe. - // The large macro blocks are here to warn that we should be careful when - // aliasing the arguments to split_into_endomorphism_scalars -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - fr k1{ (*k1_t).data[0], (*k1_t).data[1], 0, 0 }; - fr k2{ (*k2_t).data[0], (*k2_t).data[1], 0, 0 }; -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - g1::element result; - g1::element t1 = g1::affine_one * k1; - g1::affine_element generator = g1::affine_one; - fq beta = fq::cube_root_of_unity(); - generator.x = generator.x * beta; - generator.y = -generator.y; - g1::element t2 = generator * k2; - result = t1 + t2; - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, radix_sort) -{ - // check that our radix sort correctly sorts! 
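    // Concretely, "correctly sorts" means that within each round the schedule entries produced by
    // organize_buckets are non-decreasing in their low 31 bits (the bucket index). A sketch of that property,
    // with illustrative names, would be:
    //
    //   auto by_bucket = [](uint64_t a, uint64_t b) { return (a & 0x7fffffffULL) < (b & 0x7fffffffULL); };
    //   bool sorted = std::is_sorted(round_schedule, round_schedule + num_round_entries, by_bucket);
    //
    // The loop below checks the same pairwise comparison directly, and additionally that every sorted entry
    // appears somewhere in the unsorted input.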
- constexpr size_t target_degree = 1 << 8; - constexpr size_t num_rounds = scalar_multiplication::get_num_rounds(target_degree * 2); - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < target_degree; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); - - uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); - memcpy((void*)wnaf_copy, (void*)state.point_schedule, sizeof(uint64_t) * target_degree * 2 * num_rounds); - - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, target_degree * 2); - for (size_t i = 0; i < num_rounds; ++i) { - uint64_t* unsorted_wnaf = &wnaf_copy[i * target_degree * 2]; - uint64_t* sorted_wnaf = &state.point_schedule[i * target_degree * 2]; - - const auto find_entry = [unsorted_wnaf, num_entries = target_degree * 2](auto x) { - for (size_t k = 0; k < num_entries; ++k) { - if (unsorted_wnaf[k] == x) { - return true; - } - } - return false; - }; - for (size_t j = 0; j < target_degree * 2; ++j) { - EXPECT_EQ(find_entry(sorted_wnaf[j]), true); - if (j > 0) { - EXPECT_EQ((sorted_wnaf[j] & 0x7fffffffU) >= (sorted_wnaf[j - 1] & 0x7fffffffU), true); - } - } - } - - free(scalars); - free(wnaf_copy); -} - -HEAVY_TEST(scalar_multiplication, oversized_inputs) -{ - // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. - // Check that this is done correctly - size_t transcript_degree = 1 << 20; - size_t target_degree = 1200000; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - g2::affine_element g2_x; - io::read_transcript(monomials, g2_x, transcript_degree, BARRETENBERG_SRS_PATH); - - memcpy((void*)(monomials + (2 * transcript_degree)), - (void*)monomials, - ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - fr accumulator = source_scalar; - for (size_t i = 0; i < target_degree; ++i) { - accumulator *= source_scalar; - fr::__copy(accumulator, scalars[i]); - } - scalar_multiplication::pippenger_runtime_state state(target_degree); - - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); - first = first.normalize(); - - for (size_t i = 0; i < target_degree; ++i) { - scalars[i].self_neg(); - } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); - - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); - second = second.normalize(); - - EXPECT_EQ((first.z == second.z), true); - EXPECT_EQ((first.z == fq::one()), true); - EXPECT_EQ((first.x == second.x), true); - EXPECT_EQ((first.y == -second.y), true); - - aligned_free(monomials); - aligned_free(scalars); -} - -TEST(scalar_multiplication, undersized_inputs) -{ - // we fall back to traditional scalar multiplication algorithm for small input sizes. 
- // Check this is done correctly - size_t num_points = 17; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_edge_case_dbl) -{ - constexpr size_t num_points = 128; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - g1::affine_element point = g1::affine_element(g1::element::random_element()); - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = point; - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - if (!expected.is_point_at_infinity()) { - expected = expected.normalize(); - } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = 
engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_unsafe) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_unsafe_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = 
engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_one) -{ - size_t num_points = 1; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * 1); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(scalar_multiplication, pippenger_zero_points) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} - -TEST(scalar_multiplication, pippenger_mul_by_zero) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalars[0] = fr::zero(); - points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); - - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp index c1a9bc0457..cd575a16d6 100644 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp +++ b/cpp/src/barretenberg/ecc/curves/grumpkin/grumpkin.hpp @@ -39,7 +39,8 @@ class Grumpkin { public: using ScalarField = barretenberg::fq; using BaseField = barretenberg::fr; - using ProjectiveElement = typename grumpkin::g1::element; - using AffineElement = typename 
grumpkin::g1::affine_element; + using Group = typename grumpkin::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp deleted file mode 100644 index b490f4973d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// #include "scalar_multiplication.hpp" -// #include "pippenger.hpp" -// #include "barretenberg/common/mem.hpp" - -// using namespace barretenberg; - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size) -// { -// auto ptr = aligned_alloc(64, size); -// return ptr; -// } - -// WASM_EXPORT void bbfree(void* ptr) -// { -// aligned_free(ptr); -// } - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) -// { -// auto ptr = new scalar_multiplication::Pippenger(points, num_points); -// return ptr; -// } - -// WASM_EXPORT void delete_pippenger(void* pippenger) -// { -// delete reinterpret_cast(pippenger); -// } - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr) -// { -// scalar_multiplication::pippenger_runtime_state state(range); -// auto pippenger = reinterpret_cast(pippenger_ptr); -// auto scalars = reinterpret_cast(scalars_ptr); -// auto result = reinterpret_cast(result_ptr); -// *result = pippenger->pippenger_unsafe(scalars, from, range); -// } - -// WASM_EXPORT void g1_sum(void* points_ptr, const size_t num_points, void* result_ptr) -// { -// auto points = reinterpret_cast(points_ptr); -// auto result = reinterpret_cast(result_ptr); -// result->self_set_infinity(); -// *result = std::accumulate(points, points + num_points, *result); -// } -// } diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp deleted file mode 100644 index 60af544690..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/c_bind.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #include -// #include "../g1.hpp" - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size); - -// WASM_EXPORT void bbfree(void* ptr); - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points); - -// WASM_EXPORT void delete_pippenger(void* pippenger); - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr); WASM_EXPORT void g1_sum(void* points_ptr, size_t num_points, void* result_ptr); -// } diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp deleted file mode 100644 index 08c6b62960..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "pippenger.hpp" -#include "barretenberg/srs/io.hpp" -namespace grumpkin { -namespace scalar_multiplication { - -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) - : monomials_(points) - , num_points_(num_points) -{ - grumpkin::io::byteswap(&monomials_[0], num_points * 64); - 
scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(uint8_t const* points, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - grumpkin::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -Pippenger::Pippenger(std::string const& path, size_t num_points) - : num_points_(num_points) -{ - monomials_ = point_table_alloc(num_points); - - grumpkin::io::read_transcript_g1(monomials_, num_points, path); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); -} - -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) -{ - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); -} - -Pippenger::~Pippenger() -{ - free(monomials_); -} - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp deleted file mode 100644 index a6c55f1bc4..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/pippenger.hpp +++ /dev/null @@ -1,62 +0,0 @@ -#pragma once -#include "./scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace grumpkin { -namespace scalar_multiplication { - -inline size_t point_table_size(size_t num_points) -{ -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - - return 2 * num_points + prefetch_overflow; -} - -template inline size_t point_table_buf_size(size_t num_points) -{ - return sizeof(T) * point_table_size(num_points); -} - -template inline T* point_table_alloc(size_t num_points) -{ - return (T*)aligned_alloc(64, point_table_buf_size(num_points)); -} - -class Pippenger { - public: - /** - * Expects points to be buffer of size as per point_table_size(). - * It expects the crs to start at points[1], and it fills in affine_one at points[0]. - * The crs undergoes a byteswap, and then the point table is generated. 
- */ - Pippenger(g1::affine_element* points, size_t num_points); - - Pippenger(uint8_t const* points, size_t num_points); - - Pippenger(std::string const& path, size_t num_points); - - ~Pippenger(); - - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); - - g1::affine_element* get_point_table() const { return monomials_; } - - size_t get_num_points() const { return num_points_; } - - private: - g1::affine_element* monomials_; - size_t num_points_; -}; - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp deleted file mode 100644 index f56bdaa936..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "process_buckets.hpp" - -#include - -namespace grumpkin { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept -{ - constexpr size_t num_bits = 8; - constexpr size_t num_buckets = 1UL << num_bits; - constexpr uint32_t mask = static_cast(num_buckets) - 1U; - std::array bucket_counts{}; - - for (size_t i = 0; i < num_entries; ++i) { - bucket_counts[(keys[i] >> shift) & mask]++; - } - - std::array offsets; - std::array offsets_copy; - offsets[0] = 0; - - for (size_t i = 0; i < num_buckets - 1; ++i) { - bucket_counts[i + 1] += bucket_counts[i]; - } - for (size_t i = 1; i < num_buckets + 1; ++i) { - offsets[i] = bucket_counts[i - 1]; - } - for (size_t i = 0; i < num_buckets + 1; ++i) { - offsets_copy[i] = offsets[i]; - } - uint64_t* start = &keys[0]; - - for (size_t i = 0; i < num_buckets; ++i) { - uint64_t* bucket_start = &keys[offsets[i]]; - const uint64_t* bucket_end = &keys[offsets_copy[i + 1]]; - while (bucket_start != bucket_end) { - for (uint64_t* it = bucket_start; it < bucket_end; ++it) { - const size_t value = (*it >> shift) & mask; - const uint64_t offset = offsets[value]++; - std::iter_swap(it, start + offset); - } - bucket_start = &keys[offsets[i]]; - } - } - if (shift > 0) { - for (size_t i = 0; i < num_buckets; ++i) { - if (offsets_copy[i + 1] - offsets_copy[i] > 1) { - radix_sort(&keys[offsets_copy[i]], offsets_copy[i + 1] - offsets_copy[i], shift - 8); - } - } - } -} - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept -{ - const uint32_t bits_per_round = 8; - const uint32_t base = num_bits & 7; - const uint32_t total_bits = (base == 0) ? 
num_bits : num_bits - base + 8; - const uint32_t shift = total_bits - bits_per_round; - - radix_sort(wnaf_entries, num_entries, shift); -} -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp deleted file mode 100644 index d4ef31da06..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/process_buckets.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include - -namespace grumpkin { -namespace scalar_multiplication { -void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; - -void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp deleted file mode 100644 index 36d894eafa..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "runtime_states.hpp" - -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -namespace grumpkin { -namespace scalar_multiplication { - -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) -{ - constexpr size_t MAX_NUM_ROUNDS = 256; - num_points = num_initial_points * 2; - const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t prefetch_overflow = 16 * num_threads; - const size_t num_rounds = - static_cast(grumpkin::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); - point_schedule = (uint64_t*)(aligned_alloc( - 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); - skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); - bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); - bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); - round_counts = (uint64_t*)(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))); - - const size_t points_per_thread = static_cast(num_points) / num_threads; -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - const size_t thread_offset = i * points_per_thread; - 
memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); - for (size_t j = 0; j < num_rounds; ++j) { - const size_t round_offset = (j * static_cast(num_points)); - memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); - } - memset((void*)(skew_table + thread_offset), 0, points_per_thread * sizeof(bool)); - } - - memset((void*)bucket_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bit_counts, 0, num_threads * num_buckets * sizeof(uint32_t)); - memset((void*)bucket_empty_status, 0, num_threads * num_buckets * sizeof(bool)); - memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); -} - -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) -{ - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; -} - -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } - - point_schedule = other.point_schedule; - skew_table = other.skew_table; - point_pairs_1 = other.point_pairs_1; - point_pairs_2 = other.point_pairs_2; - scratch_space = other.scratch_space; - bit_counts = other.bit_counts; - bucket_counts = other.bucket_counts; - bucket_empty_status = other.bucket_empty_status; - round_counts = other.round_counts; - - other.point_schedule = nullptr; - other.skew_table = nullptr; - other.point_pairs_1 = nullptr; - other.point_pairs_2 = nullptr; - other.scratch_space = nullptr; - other.bit_counts = nullptr; - other.bucket_counts = nullptr; - other.bucket_empty_status = nullptr; - other.round_counts = nullptr; - - num_points = other.num_points; - return *this; -} - -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) -{ - const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - - scalar_multiplication::affine_product_runtime_state product_state; - - product_state.point_pairs_1 = point_pairs_1 + 
(thread_index * points_per_thread) + (thread_index * 16); - product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); - product_state.scratch_space = scratch_space + (thread_index * (points_per_thread / 2)); - product_state.bucket_counts = bucket_counts + (thread_index * (num_buckets)); - product_state.bit_offsets = bit_counts + (thread_index * (num_buckets)); - product_state.bucket_empty_status = bucket_empty_status + (thread_index * (num_buckets)); - return product_state; -} - -pippenger_runtime_state::~pippenger_runtime_state() -{ - if (point_schedule) { - aligned_free(point_schedule); - } - - if (skew_table) { - aligned_free(skew_table); - } - - if (point_pairs_1) { - aligned_free(point_pairs_1); - } - - if (point_pairs_2) { - aligned_free(point_pairs_2); - } - - if (scratch_space) { - aligned_free(scratch_space); - } - - if (bit_counts) { - aligned_free(bit_counts); - } - - if (bucket_counts) { - aligned_free(bucket_counts); - } - - if (bucket_empty_status) { - aligned_free(bucket_empty_status); - } - - if (round_counts) { - aligned_free(round_counts); - } -} -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp deleted file mode 100644 index 050c955c8c..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/runtime_states.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include "../grumpkin.hpp" -#include "barretenberg/ecc/groups/wnaf.hpp" - -namespace grumpkin { -// simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. -// This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
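// A minimal, self-contained sketch of the allocation pattern this runtime state exists for:
// one aligned allocation made (and written) once up front, then handed out as disjoint
// per-thread windows, so the hot loop never takes page faults or needs synchronization.
// `scratch_pool` / `thread_view` are hypothetical names used only for illustration, and plain
// doubles stand in for the field/point types; this is not part of the library's API.
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct scratch_pool {
    double* data;
    size_t per_thread;

    scratch_pool(size_t num_threads, size_t elements_per_thread)
        : per_thread(elements_per_thread)
    {
        size_t bytes = num_threads * per_thread * sizeof(double);
        bytes = (bytes + 63) & ~size_t(63); // std::aligned_alloc requires a multiple of the alignment
        data = static_cast<double*>(std::aligned_alloc(64, bytes));
        std::memset(data, 0, bytes); // touch every page once, up front, rather than inside the hot loop
    }

    // Each thread works on its own disjoint window of the shared buffer.
    double* thread_view(size_t thread_index) { return data + thread_index * per_thread; }

    ~scratch_pool() { std::free(data); }
};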
-namespace scalar_multiplication { -constexpr size_t get_optimal_bucket_width(const size_t num_points) -{ - if (num_points >= 14617149) { - return 21; - } - if (num_points >= 1139094) { - return 18; - } - // if (num_points >= 100000) - if (num_points >= 155975) { - return 15; - } - if (num_points >= 144834) - // if (num_points >= 100000) - { - return 14; - } - if (num_points >= 25067) { - return 12; - } - if (num_points >= 13926) { - return 11; - } - if (num_points >= 7659) { - return 10; - } - if (num_points >= 2436) { - return 9; - } - if (num_points >= 376) { - return 7; - } - if (num_points >= 231) { - return 6; - } - if (num_points >= 97) { - return 5; - } - if (num_points >= 35) { - return 4; - } - if (num_points >= 10) { - return 3; - } - if (num_points >= 2) { - return 2; - } - return 1; -} - -constexpr size_t get_num_rounds(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return WNAF_SIZE(bits_per_bucket + 1); -} - -// WORKTODO: uniformize -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_offsets; - uint64_t* point_schedule; - uint32_t num_points; - uint32_t num_buckets; - bool* bucket_empty_status; -}; - -struct pippenger_runtime_state { - uint64_t* point_schedule; - bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; - uint32_t* bucket_counts; - uint32_t* bit_counts; - bool* bucket_empty_status; - uint64_t* round_counts; - uint64_t num_points; - - pippenger_runtime_state(const size_t num_initial_points); - pippenger_runtime_state(pippenger_runtime_state&& other); - pippenger_runtime_state& operator=(pippenger_runtime_state&& other); - ~pippenger_runtime_state(); - - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); -}; -} // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp deleted file mode 100644 index d61158b143..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.cpp +++ /dev/null @@ -1,947 +0,0 @@ -#include "./scalar_multiplication.hpp" - -#include "barretenberg/common/throw_or_abort.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/max_threads.hpp" -#include "barretenberg/numeric/bitop/get_msb.hpp" - -#include -#include -#include -#include - -#include "../../../groups/wnaf.hpp" -#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" -#include "barretenberg/ecc/curves/bn254/fq.hpp" -#include "barretenberg/ecc/curves/bn254/fr.hpp" -#include "barretenberg/ecc/curves/bn254/g1.hpp" -#include "./process_buckets.hpp" -#include "./runtime_states.hpp" - -#ifndef NO_MULTITHREADING -#include -#endif - -#define BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 16] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 17] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 18] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 19] >> 32ULL)); \ - __builtin_prefetch(state.points + 
(state.point_schedule[schedule_it + 20] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 21] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 22] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 23] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 24] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 25] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 26] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 27] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 28] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 29] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 30] >> 32ULL)); \ - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 31] >> 32ULL)); \ - \ - uint64_t schedule_a = state.point_schedule[schedule_it]; \ - uint64_t schedule_b = state.point_schedule[schedule_it + 1]; \ - uint64_t schedule_c = state.point_schedule[schedule_it + 2]; \ - uint64_t schedule_d = state.point_schedule[schedule_it + 3]; \ - uint64_t schedule_e = state.point_schedule[schedule_it + 4]; \ - uint64_t schedule_f = state.point_schedule[schedule_it + 5]; \ - uint64_t schedule_g = state.point_schedule[schedule_it + 6]; \ - uint64_t schedule_h = state.point_schedule[schedule_it + 7]; \ - uint64_t schedule_i = state.point_schedule[schedule_it + 8]; \ - uint64_t schedule_j = state.point_schedule[schedule_it + 9]; \ - uint64_t schedule_k = state.point_schedule[schedule_it + 10]; \ - uint64_t schedule_l = state.point_schedule[schedule_it + 11]; \ - uint64_t schedule_m = state.point_schedule[schedule_it + 12]; \ - uint64_t schedule_n = state.point_schedule[schedule_it + 13]; \ - uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ - uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ - \ - g1::conditional_negate_affine( \ - state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ - state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 
9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ - \ - current_offset += 16; \ - schedule_it += 16; - -namespace grumpkin { -namespace scalar_multiplication { - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) -{ - // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); - for (size_t i = num_points - 1; i < num_points; --i) { - table[i * 2] = points[i]; - table[i * 2 + 1].x = beta * points[i].x; - table[i * 2 + 1].y = -points[i].y; - } -} - -/** - * Compute the windowed-non-adjacent-form versions of our scalar multipliers. - * - * We start by splitting our 254 bit scalars into 2 127-bit scalars, using the short weierstrass curve endomorphism - * (for a point P \in \G === (x, y) \in \Fq, then (\beta x, y) = (\lambda) * P , where \beta = 1^{1/3} mod Fq and - *\lambda = 1^{1/3} mod Fr) (which means we can represent a scalar multiplication (k * P) as (k1 * P + k2 * \lambda * - *P), where k1, k2 have 127 bits) (see field::split_into_endomorphism_scalars for more details) - * - * Once we have our 127-bit scalar multipliers, we determine the optimal number of pippenger rounds, given the number of - *points we're multiplying. Once we have the number of rounds, `m`, we need to split our scalar into `m` bit-slices. - *Each pippenger round will work on one bit-slice. - * - * Pippenger's algorithm works by, for each round, iterating over the points we're multplying. For each point, we - *examing the point's scalar multiplier and extract the bit-slice associated with the current pippenger round (we start - *with the most significant slice). We then use the bit-slice to index a 'bucket', which we add the point into. For - *example, if the bit slice is 01101, we add the corresponding point into bucket[13]. - * - * At the end of each pippenger round we concatenate the buckets together. E.g. if we have 8 buckets, we compute: - * sum = bucket[0] + 2 * bucket[1] + 3 * bucket[2] + 4 * bucket[3] + 5 * bucket[4] + 6 * bucket[5] + 7 * bucket[6] + 8 * - *bucket[7]. - * - * At the end of each pippenger round, the bucket sum will contain the scalar multiplication result for one bit slice. - * For example, say we have 16 rounds, where each bit slice contains 8 bits (8 * 16 = 128, enough to represent our 127 - *bit scalars). At the end of the first round, we will have taken the 8 most significant bits from every scalar - *multiplier. 
Our bucket sum will be the result of a mini-scalar-multiplication, where we have multiplied every point by - *the 8 most significant bits of each point's scalar multiplier. - * - * We repeat this process for every pippenger round. In our example, this gives us 16 bucket sums. - * We need to multiply the most significant bucket sum by 2^{120}, the second most significant bucket sum by 2^{112} - *etc. Once this is done we can add the bucket sums together, to evaluate our scalar multiplication result. - * - * Pippenger has complexity O(n / logn), because of two factors at play: the number of buckets we need to concatenate - *per round, and the number of points we need to add into buckets per round. - * - * To minimize the number of point additions per round, we want fewer rounds. But fewer rounds increases the number of - *bucket concatenations. The more points we have, the greater the time saving when reducing the number of rounds, which - *means we can afford to have more buckets per round. - * - * For a concrete example, with 2^20 points, the sweet spot is 2^15 buckets - with 2^15 buckets we can evaluate our 127 - *bit scalar multipliers in 8 rounds (we can represent b-bit windows with 2^{b-1} buckets, more on that below). - * - * This means that, for each round, we add 2^21 points into buckets (we've split our scalar multpliers into two - *half-width multipliers, so each round has twice the number of points. This is the reason why the endormorphism is - *useful here; without the endomorphism, we would need twice the number of buckets for each round). - * - * We also concatenate 2^15 buckets for each round. This requires 2^16 point additions. - * - * Meaning that the total number of point additions is (8 * 2^21) + (8 * 2^16) = 33 * 2^19 ~ 2^24 point additions. - * If we were to use a simple Montgomery double-and-add ladder to exponentiate each point, we would need 2^27 point - *additions (each scalar multiplier has ~2^7 non-zero bits, and there are 2^20 points). - * - * This makes pippenger 8 times faster than the naive O(n) equivalent. Given that a circuit with 1 million gates will - *require 9 multiple-scalar-multiplications with 2^20 points, efficiently using Pippenger's algorithm is essential for - *fast provers - * - * One additional efficiency gain is the use of 2^{b-1} buckets to represent b bits. To do this we represent our - *bit-slices in non-adjacent form. Non-adjacent form represents values using a base, where each 'bit' can take the - *values (-1, 0, 1). This is considerably more efficient than binary form for scalar multiplication, as inverting a - *point can be done by negating the y-coordinate. - * - * We actually use a slightly different representation than simple non-adjacent form. To represent b bits, a bit slice - *contains values from (-2^{b} - 1, ..., -1, 1, ..., 2^{b} - 1). i.e. we only have odd values. We do this to eliminate - *0-valued windows, as having a conditional branch in our hot loop to check if an entry is 0 is somethin we want to - *avoid. - * - * The above representation can be used to represent any binary number as long as we add a 'skew' factor. Each scalar - *multiplier's `skew` tracks if the scalar multiplier is even or odd. If it's even, `skew = true`, and we add `1` to our - *multiplier to make it odd. - * - * We then, at the end of the Pippenger algorithm, subtract a point from the total result, if that point's skew is - *`true`. - * - * At the end of `compute_wnaf_states`, `state.wnaf_table` will contain our wnaf entries, but unsorted. 
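// A minimal, self-contained sketch of the bucket-concatenation step described above:
// computing bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with roughly 2n additions via a
// running sum. `concatenate_buckets` is a hypothetical helper shown only for illustration,
// with plain integers standing in for curve points.
#include <cstdint>
#include <vector>

// Walking from the top bucket down, each bucket is folded into `running_sum` exactly once,
// and `accumulator` absorbs `running_sum` once per bucket, so bucket[k] is added (k + 1) times.
template <typename T> T concatenate_buckets(const std::vector<T>& buckets)
{
    T running_sum{};
    T accumulator{};
    for (size_t k = buckets.size(); k-- > 0;) {
        running_sum = running_sum + buckets[k];
        accumulator = accumulator + running_sum;
    }
    return accumulator;
}
// e.g. concatenate_buckets<uint64_t>({ 1, 1, 1, 1 }) returns 1 + 2 + 3 + 4 = 10.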
- * - * @param point_schedule Pointer to the output array with all WNAFs - * @param input_skew_table Pointer to the output array with all skews - * @param round_counts The number of points in each round - * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF - * @param num_initial_points The number of points before the endomorphism split - **/ -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points) -{ - const size_t num_points = num_initial_points * 2; - constexpr size_t MAX_NUM_ROUNDS = 256; - constexpr size_t MAX_NUM_THREADS = 128; - const size_t num_rounds = get_num_rounds(num_points); - const size_t bits_per_bucket = get_optimal_bucket_width(num_initial_points); - const size_t wnaf_bits = bits_per_bucket + 1; -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t num_initial_points_per_thread = num_initial_points / num_threads; - const size_t num_points_per_thread = num_points / num_threads; - std::array, MAX_NUM_THREADS> thread_round_counts; - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - thread_round_counts[i][j] = 0; - } - } -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_threads; ++i) { - fr T0; - uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; - bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; - uint64_t offset = i * num_points_per_thread; - - for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { - T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); - } - } - - for (size_t i = 0; i < num_rounds; ++i) { - round_counts[i] = 0; - } - for (size_t i = 0; i < num_threads; ++i) { - for (size_t j = 0; j < num_rounds; ++j) { - round_counts[j] += thread_round_counts[i][j]; - } - } -} - -/** - * Sorts our wnaf entries in increasing bucket order (per round). - * We currently don't multi-thread the inner sorting algorithm, and just split our threads over the number of rounds. - * A multi-threaded sorting algorithm could be more efficient, but the total runtime of `organize_buckets` is <5% of - * pippenger's runtime, so not a priority. - **/ -void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t num_points) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_rounds; ++i) { - scalar_multiplication::process_buckets(&point_schedule[i * num_points], - num_points, - static_cast(get_optimal_bucket_width(num_points / 2)) + 1); - } -} - -/** - * adds a bunch of points together using affine addition formulae. - * Paradoxically, the affine formula is crazy efficient if you have a lot of independent point additions to perform. 
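// A minimal, self-contained sketch of the Montgomery batch-inversion idea the affine trick
// relies on (the full explanation follows in the comment below): one field inversion plus
// roughly three multiplications per element. `Fp` and `batch_invert` are hypothetical names
// over a toy prime field, shown only for illustration; zero elements are assumed absent,
// mirroring the zero check in the real code.
#include <cstdint>
#include <vector>

struct Fp {
    static constexpr uint64_t p = 1000003; // small prime standing in for the real base field
    uint64_t v;
    Fp operator*(Fp o) const { return { (v * o.v) % p }; }
    Fp invert() const // the single "expensive" inversion, via Fermat's little theorem
    {
        uint64_t r = 1, b = v, e = p - 2;
        while (e) {
            if (e & 1) { r = (r * b) % p; }
            b = (b * b) % p;
            e >>= 1;
        }
        return { r };
    }
};

// Invert every element of `x` in place using exactly one call to Fp::invert().
void batch_invert(std::vector<Fp>& x)
{
    std::vector<Fp> prefix(x.size());
    Fp acc{ 1 };
    for (size_t i = 0; i < x.size(); ++i) {
        prefix[i] = acc = acc * x[i]; // prefix[i] = x_0 * x_1 * ... * x_i
    }
    Fp inv = acc.invert(); // I = 1 / (x_0 * ... * x_{n-1})
    for (size_t i = x.size(); i-- > 0;) {
        Fp xi = x[i];
        x[i] = (i == 0) ? inv : inv * prefix[i - 1]; // 1/x_i = I * (x_0...x_{i-1}) * (x_{i+1}...x_{n-1})
        inv = inv * xi;                              // strip x_i: inv is now 1 / (x_0 * ... * x_{i-1})
    }
}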
- * Affine formula: - * - * \lambda = (y_2 - y_1) / (x_2 - x_1) - * x_3 = \lambda^2 - (x_2 + x_1) - * y_3 = \lambda*(x_1 - x_3) - y_1 - * - * Traditionally, we avoid affine formulae like the plague, because computing lambda requires a modular inverse, - * which is outrageously expensive. - * - * However! We can use Montgomery's batch inversion technique to amortise the cost of the inversion to ~0. - * - * The way batch inversion works is as follows. Let's say you want to compute \{ 1/x_1, 1/x_2, ..., 1/x_n \} - * The trick is to compute the product x_1x_2...x_n , whilst storing all of the temporary products. - * i.e. we have an array A = [x_1, x_1x_2, ..., x_1x_2...x_n] - * We then compute a single inverse: I = 1 / x_1x_2...x_n - * Finally, we can use our accumulated products, to quotient out individual inverses. - * We can get an individual inverse at index i, by computing I.A_{i-1}.(x_nx_n-1...x_i+1) - * The last product term we can compute on-the-fly, as it grows by one element for each additional inverse that we - * require. - * - * TLDR: amortized cost of a modular inverse is 3 field multiplications per inverse. - * Which means we can compute a point addition with SIX field multiplications in total. - * The traditional Jacobian-coordinate formula requires 11. - * - * There is a catch though - we need large sequences of independent point additions! - * i.e. the output from one point addition in the sequence is NOT an input to any other point addition in the sequence. - * - * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated - **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - - if (batch_inversion_accumulator == 0) { - throw_or_abort("attempted to invert zero in add_affine_points"); - } else { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. 
- // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) -{ - fq batch_inversion_accumulator = fq::one(); - - for (size_t i = 0; i < num_points; i += 2) { - if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { - continue; - } - if (points[i].x == points[i + 1].x) { - if (points[i].y == points[i + 1].y) { - // double - scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); - points[i + 1].x = points[i].y + points[i].y; // 2y - points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 - points[i + 1].y *= batch_inversion_accumulator; - batch_inversion_accumulator *= (points[i + 1].x); - continue; - } - points[i].self_set_infinity(); - points[i + 1].self_set_infinity(); - continue; - } - - scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 - points[i + 1].x -= points[i].x; // x2 - x1 - points[i + 1].y -= points[i].y; // y2 - y1 - points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old - batch_inversion_accumulator *= (points[i + 1].x); - } - if (!batch_inversion_accumulator.is_zero()) { - batch_inversion_accumulator = batch_inversion_accumulator.invert(); - } - for (size_t i = (num_points)-2; i < num_points; i -= 2) { - // Memory bandwidth is a bit of a bottleneck here. - // There's probably a more elegant way of structuring our data so we don't need to do all of this prefetching - __builtin_prefetch(points + i - 2); - __builtin_prefetch(points + i - 1); - __builtin_prefetch(points + ((i + num_points - 2) >> 1)); - __builtin_prefetch(scratch_space + ((i - 2) >> 1)); - - if (points[i].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i + 1]; - continue; - } - if (points[i + 1].is_point_at_infinity()) { - points[(i + num_points) >> 1] = points[i]; - continue; - } - - points[i + 1].y *= batch_inversion_accumulator; // update accumulator - batch_inversion_accumulator *= points[i + 1].x; - points[i + 1].x = points[i + 1].y.sqr(); - points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2 - // - x1 - points[i].x -= points[(i + num_points) >> 1].x; - points[i].x *= points[i + 1].y; - points[(i + num_points) >> 1].y = points[i].x - points[i].y; - } -} - -/** - * evaluate a chain of pairwise additions. - * The additions are sequenced into base-2 segments - * i.e. 
pairs, pairs of pairs, pairs of pairs of pairs etc - * `max_bucket_bits` indicates the largest set of nested pairs in the array, - * which defines the iteration depth - **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) -{ - size_t end = state.num_points; - size_t start = 0; - for (size_t i = 0; i < max_bucket_bits; ++i) { - const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); - start = end - points_in_round; - if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); - } - } -} - -/** - * This is the entry point for our 'find a way of evaluating a giant multi-product using affine coordinates' algorithm - * By this point, we have already sorted our pippenger buckets. So we have the following situation: - * - * 1. We have a defined number of buckets points - * 2. We have a defined number of points, that need to be added into these bucket points - * 3. number of points >> number of buckets - * - * The algorithm begins by counting the number of points assigned to each bucket. - * For each bucket, we then take this count and split it into its base-2 components. - * e.g. if bucket[3] has 14 points, we split that into a sequence of (8, 4, 2) - * This base-2 splitting is useful, because we can take the bucket's associated points, and - * sort them into pairs, quads, octs etc. These mini-addition sequences are independent from one another, - * which means that we can use the affine trick to evaluate them. - * Once we're done, we have effectively reduced the number of points in the bucket to a logarithmic factor of the input. - * e.g. in the above example, once we've evaluated our pairwise addition of 8, 4 and 2 elements, - * we're left with 3 points. - * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. - * We repeat this process until every bucket only has one point assigned to it. - **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) -{ - - // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); - // This method sorts our points into our required base-2 sequences. - // `max_bucket_bits` is log2(maximum bucket count). - // This sets the upper limit on how many iterations we need to perform in `evaluate_addition_chains`. - // e.g. if `max_bucket_bits == 3`, then we have at least one bucket with >= 8 points in it. - // which means we need to repeat our pairwise addition algorithm 3 times - // (e.g. add 4 pairs together to get 2 pairs, add those pairs together to get a single pair, which we add to reduce - // to our final point) - const size_t max_bucket_bits = construct_addition_chains(state, first_round); - - // if max_bucket_bits is 0, we're done! we can return - if (max_bucket_bits == 0) { - return state.point_pairs_1; - } - - // compute our required additions using the affine trick - evaluate_addition_chains(state, max_bucket_bits, handle_edge_cases); - - // this next step is a processing step, that computes a new point schedule for our reduced points. - // In the pippenger algorithm, we use a 64-bit uint to categorize each point. - // The high 32 bits describes the position of the point in a point array. 
- // The low 31 bits describes the bucket index that the point maps to - // The 32nd bit defines whether the point is actually a negation of our stored point. - - // We want to compute these 'point schedule' uints for our reduced points, so that we can recurse back into - // `reduce_buckets` - uint32_t start = 0; - const uint32_t end = static_cast(state.num_points); - // The output of `evaluate_addition_chains` has a bit of an odd structure, should probably refactor. - // Effectively, we used to have one big 1d array, and the act of computing these pair-wise point additions - // has chopped it up into sequences of smaller 1d arrays, with gaps in between - for (size_t i = 0; i < max_bucket_bits; ++i) { - const uint32_t points_in_round = - (static_cast(state.num_points) - state.bit_offsets[i + 1]) >> static_cast(i); - const uint32_t points_removed = points_in_round / 2; - - start = end - points_in_round; - const uint32_t modified_start = start + points_removed; - state.bit_offsets[i + 1] = modified_start; - } - - // iterate over each bucket. Identify how many remaining points there are, and compute their point scheduels - uint32_t new_num_points = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t& count = state.bucket_counts[i]; - uint32_t num_bits = numeric::get_msb(count) + 1; - uint32_t new_bucket_count = 0; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = state.bit_offsets[j]; - const bool has_entry = ((count >> j) & 1) == 1; - if (has_entry) { - uint64_t schedule = (static_cast(current_offset) << 32ULL) + i; - state.point_schedule[new_num_points++] = schedule; - ++new_bucket_count; - ++current_offset; - } - } - count = new_bucket_count; - } - - // modify `num_points` to reflect the new number of reduced points. - // also swap around the `point_pairs` pointer; what used to be our temporary array - // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; - state.num_points = new_num_points; - state.points = state.point_pairs_1; - state.point_pairs_1 = state.point_pairs_2; - state.point_pairs_2 = temp; - - // We could probably speed this up by unroling the recursion. - // But each extra call to `reduce_buckets` has an input size that is ~log(previous input size) - // so the extra run-time is meh - return reduce_buckets(state, false, handle_edge_cases); -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) -{ - // if this is the first call to `construct_addition_chains`, we need to count up our buckets - if (empty_bucket_counts) { - memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); - const uint32_t first_bucket = static_cast(state.point_schedule[0] & 0x7fffffffUL); - for (size_t i = 0; i < state.num_points; ++i) { - size_t bucket_index = static_cast(state.point_schedule[i] & 0x7fffffffUL); - ++state.bucket_counts[bucket_index - first_bucket]; - } - for (size_t i = 0; i < state.num_buckets; ++i) { - state.bucket_empty_status[i] = (state.bucket_counts[i] == 0); - } - } - - uint32_t max_count = 0; - for (size_t i = 0; i < state.num_buckets; ++i) { - max_count = state.bucket_counts[i] > max_count ? state.bucket_counts[i] : max_count; - } - - const uint32_t max_bucket_bits = numeric::get_msb(max_count); - - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - state.bit_offsets[i] = 0; - } - - // theoretically, can be unrolled using templated methods. - // However, explicitly unrolling the loop by using recursive template calls was slower! 
- // Inner loop is currently bounded by a constexpr variable, need to see what the compiler does with that... - count_bits(state.bucket_counts, &state.bit_offsets[0], state.num_buckets, max_bucket_bits); - - // we need to update `bit_offsets` to compute our point shuffle, - // but we need the original array later on, so make a copy. - std::array bit_offsets_copy = { 0 }; - for (size_t i = 0; i < max_bucket_bits + 1; ++i) { - bit_offsets_copy[i] = state.bit_offsets[i]; - } - - // this is where we take each bucket's associated points, and arrange them - // in a pairwise order, so that we can compute large sequences of additions using the affine trick - size_t schedule_it = 0; - uint32_t* bucket_count_it = state.bucket_counts; - - for (size_t i = 0; i < state.num_buckets; ++i) { - uint32_t count = *bucket_count_it; - ++bucket_count_it; - uint32_t num_bits = numeric::get_msb(count) + 1; - for (size_t j = 0; j < num_bits; ++j) { - uint32_t& current_offset = bit_offsets_copy[j]; - const size_t k_end = count & (1UL << j); - // This section is a bottleneck - to populate our point array, we need - // to read from memory locations that are effectively uniformly randomly distributed! - // (assuming our scalar multipliers are uniformly random...) - // In the absence of a more elegant solution, we use ugly macro hacks to try and - // unroll loops, and prefetch memory a few cycles before we need it - switch (k_end) { - case 64: { - [[fallthrough]]; - } - case 32: { - [[fallthrough]]; - } - case 16: { - for (size_t k = 0; k < (k_end >> 4); ++k) { - BBERG_SCALAR_MULTIPLICATION_FETCH_BLOCK; - } - break; - } - case 8: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 8] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 9] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 10] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 11] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 12] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 13] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 14] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 15] >> 32ULL)); - - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - const uint64_t schedule_e = state.point_schedule[schedule_it + 4]; - const uint64_t schedule_f = state.point_schedule[schedule_it + 5]; - const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; - const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - 
state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); - - current_offset += 8; - schedule_it += 8; - break; - } - case 4: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; - const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - current_offset += 4; - schedule_it += 4; - break; - } - case 2: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - current_offset += 2; - schedule_it += 2; - break; - } - case 1: { - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 4] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 5] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 6] >> 32ULL)); - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); - const uint64_t schedule_a = state.point_schedule[schedule_it]; - - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - ++current_offset; - ++schedule_it; - break; - } - case 0: { - break; - } - default: { - for (size_t k = 0; k < k_end; ++k) { - uint64_t schedule = state.point_schedule[schedule_it]; - __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 1] >> 32ULL)); - - const uint64_t 
predicate = (schedule >> 31UL) & 1UL; - - g1::conditional_negate_affine( - state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); - ++current_offset; - ++schedule_it; - } - } - } - } - } - return max_bucket_bits; -} - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases) -{ - const size_t num_rounds = get_num_rounds(num_points); -#ifndef NO_MULTITHREADING - const size_t num_threads = max_threads::compute_num_threads(); -#else - const size_t num_threads = 1; -#endif - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); - -#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t j = 0; j < num_threads; ++j) { - thread_accumulators[j].self_set_infinity(); - - for (size_t i = 0; i < num_rounds; ++i) { - - const uint64_t num_round_points = state.round_counts[i]; - - g1::element accumulator; - accumulator.self_set_infinity(); - - if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { - } else { - - const uint64_t num_round_points_per_thread = num_round_points / num_threads; - const uint64_t leftovers = - (j == num_threads - 1) ? (num_round_points) - (num_round_points_per_thread * num_threads) : 0; - - uint64_t* thread_point_schedule = - &state.point_schedule[(i * num_points) + j * num_round_points_per_thread]; - const size_t first_bucket = thread_point_schedule[0] & 0x7fffffffU; - const size_t last_bucket = - thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; - const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); - product_state.num_points = static_cast(num_round_points_per_thread + leftovers); - product_state.points = points; - product_state.point_schedule = thread_point_schedule; - product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; - running_sum.self_set_infinity(); - - // one nice side-effect of the affine trick, is that half of the bucket concatenation - // algorithm can use mixed addition formulae, instead of full addition formulae - size_t output_it = product_state.num_points - 1; - for (size_t k = num_thread_buckets - 1; k > 0; --k) { - if (__builtin_expect(!product_state.bucket_empty_status[k], 1)) { - running_sum += (output_buckets[output_it]); - --output_it; - } - accumulator += running_sum; - } - running_sum += output_buckets[0]; - accumulator.self_dbl(); - accumulator += running_sum; - - // we now need to scale up 'running sum' up to the value of the first bucket. - // e.g. 
if first bucket is 0, no scaling - // if first bucket is 1, we need to add (2 * running_sum) - if (first_bucket > 0) { - uint32_t multiplier = static_cast(first_bucket << 1UL); - size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; - bool init = false; - while (shift != static_cast(-1)) { - if (init) { - rolling_accumulator.self_dbl(); - if (((multiplier >> shift) & 1)) { - rolling_accumulator += running_sum; - } - } else { - rolling_accumulator += running_sum; - } - init = true; - shift -= 1; - } - accumulator += rolling_accumulator; - } - } - - if (i == (num_rounds - 1)) { - const size_t num_points_per_thread = num_points / num_threads; - bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; - for (size_t k = 0; k < num_points_per_thread; ++k) { - if (skew_table[k]) { - addition_temporary = -point_table[k]; - accumulator += addition_temporary; - } - } - } - - if (i > 0) { - for (size_t k = 0; k < bits_per_bucket + 1; ++k) { - thread_accumulators[j].self_dbl(); - } - } - thread_accumulators[j] += accumulator; - } - } - - g1::element result; - result.self_set_infinity(); - for (size_t i = 0; i < num_threads; ++i) { - result += thread_accumulators[i]; - } - return result; -} - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); - return result; -} - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) -{ - // our windowed non-adjacent form algorthm requires that each thread can work on at least 8 points. - // If we fall below this theshold, fall back to the traditional scalar multiplication algorithm. - // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger -#ifndef NO_MULTITHREADING - const size_t threshold = std::max(max_threads::compute_num_threads() * 8, 8UL); -#else - const size_t threshold = 8UL; -#endif - - if (num_initial_points == 0) { - g1::element out = g1::one; - out.self_set_infinity(); - return out; - } - - if (num_initial_points <= threshold) { - std::vector exponentiation_results(num_initial_points); - // might as well multithread this... - // Possible optimization: use group::batch_mul_with_endomorphism here. 
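    // Note on the stride-2 indexing below: the point table interleaves each base point with its
    // endomorphism-mapped copy (as laid out by generate_pippenger_point_table), so the original
    // bases live at even indices, hence points[i * 2].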
-#ifndef NO_MULTITHREADING -#pragma omp parallel for -#endif - for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; - } - - for (size_t i = num_initial_points - 1; i > 0; --i) { - exponentiation_results[i - 1] += exponentiation_results[i]; - } - return exponentiation_results[0]; - } - - const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); - const size_t num_slice_points = static_cast(1ULL << slice_bits); - - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); - - if (num_slice_points != num_initial_points) { - const uint64_t leftover_points = num_initial_points - num_slice_points; - return result + pippenger(scalars + num_slice_points, - points + static_cast(num_slice_points * 2), - static_cast(leftover_points), - state, - handle_edge_cases); - } else { - return result; - } -} - -/** - * It's pippenger! But this one has go-faster stripes and a prediliction for questionable life choices. - * We use affine-addition formula in this method, which paradoxically is ~45% faster than the mixed addition formulae. - * See `scalar_multiplication.cpp` for a more detailed description. - * - * It's...unsafe, because we assume that the incomplete addition formula exceptions are not triggered. - * We don't bother to check for this to avoid conditional branches in a critical section of our code. - * This is fine for situations where your bases are linearly independent (i.e. KZG10 polynomial commitments), - * because triggering the incomplete addition exceptions is about as hard as solving the disrete log problem. - * - * This is ok for the prover, but GIANT RED CLAXON WARNINGS FOR THE VERIFIER - * Don't use this in a verification algorithm! That would be a really bad idea. - * Unless you're a malicious adversary, then it would be a great idea! - * - **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - return pippenger(scalars, points, num_initial_points, state, false); -} -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) -{ - std::vector G_mod(num_initial_points * 2); - grumpkin::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); - return pippenger(scalars, &G_mod[0], num_initial_points, state, false); -} -} // namespace scalar_multiplication -} // namespace grumpkin diff --git a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp deleted file mode 100644 index a9a5c9c89d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.hpp +++ /dev/null @@ -1,154 +0,0 @@ -#pragma once - -#include "../grumpkin.hpp" -#include "./runtime_states.hpp" -#include -#include - -namespace grumpkin { -namespace scalar_multiplication { - -constexpr size_t get_num_buckets(const size_t num_points) -{ - const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - return 1UL << bits_per_bucket; -} - -/** - * pointers that describe how to add points into buckets, for the pippenger algorithm. 
- * `wnaf_table` is an unrolled two-dimensional array, with each inner array being of size `n`, - * where `n` is the number of points being multiplied. The second dimension size is defined by - * the number of pippenger rounds (fixed for a given `n`, see `get_num_rounds`) - * - * An entry of `wnaf_table` contains the following three pieces of information: - * 1: the point index that we're working on. This is stored in the high 32 bits - * 2: the bucket index that we're adding the point into. This is stored in the low 31 bits - * 3: the sign of the point we're adding (i.e. do we actually need to subtract). This is stored in the 32nd bit. - * - * We pack this information into a 64 bit unsigned integer, so that we can more efficiently sort our wnaf entries. - * For a given round, we want to sort our wnaf entries in increasing bucket index order. - * - * This is so that we can efficiently use multiple threads to execute the pippenger algorithm. - * For a given round, a given point's bucket index will be uniformly randomly distributed, - * assuming the inputs are from a zero-knowledge proof. This is because the scalar multiplier will be uniformly randomly - *distributed, and the bucket indices are derived from the scalar multiplier. - * - * This means that, if we were to iterate over all of our points in order, and add each point into its associated - *bucket, we would be accessing all of our buckets in a completely random pattern. - * - * Aside from memory latency problems this incurs, this makes the naive algorithm unsuitable for multithreading - we - *cannot assign a thread a tranche of points, because each thread will be adding points into the same set of buckets, - *triggering race conditions. We do not want to manage the overhead of thread locks for each bucket; the process of - *adding a point into a bucket takes, on average, only 400 CPU cycles, so the slowdown of managing mutex locks would add - *considerable overhead. - * - * The solution is to sort the buckets. If the buckets are sorted, we can assign a tranche of buckets to individual - *threads, safe in the knowledge that there will be no race conditions, with one condition. A thread's starting bucket - *may be equal to the previous thread's end bucket, so we need to ensure that each thread works on a local array of - *buckets. This adds little overhead (for 2^20 points, we have 32,768 buckets. With 8 threads, the amount of bucket - *overlap is ~16 buckets, so we could incur 16 extra 'additions' in pippenger's bucket concatenation phase, but this is - *an insignificant contribution). - * - * The alternative approach (the one we used to use) is to slice up all of the points being multiplied amongst all - *available threads, and run the complete pippenger algorithm for each thread. This is suboptimal, because the - *complexity of pippenger is O(n / logn) point additions, and a sequence of smaller pippenger calls will have a smaller - *`n`. - * - * This is the motivation for multi-threading the actual Pippenger algorithm. In addition, the above approach performs - *extremely poorly for GPUs, where the number of threads can be as high as 2^10 (for a multi-scalar-multiplication of - *2^20 points, this doubles the number of pippenger rounds per thread) - * - * To give concrete numbers, the difference between calling pippenger on 2^20 points, and calling pippenger 8 times on - *2^17 points, is 5-10%. Which means that, for 8 threads, we need to ensure that our sorting algorithm adds less than 5% - *to the total runtime of pippenger. 
Given a single cache miss per point would increase the run-time by 25%, this is not - *much room to work with! - * - * However, a radix sort, combined with the fact that the total number of buckets is quite small (2^16 at most), seems - *to be fast enough. Benchmarks indicate (i7-8650U, 8 threads) that, for 2^20 points, the total runtime is <1200ms and - *of that, the radix sort consumes 58ms (4.8%) - * - * One advantage of sorting by bucket order vs point order, is that a 'bucket' is 96 bytes large (sizeof(g1::element), - *buckets have z-coordinates). Points, on the other hand, are 64 bytes large (affine points, no z-coordinate). This - *makes fetching random point locations in memory more efficient than fetching random bucket locations, as each point - *occupies a single cache line. Using __builtin_prefetch to recover the point just before it's needed, seems to improve - *the runtime of pippenger by 10-20%. - * - * Finally, `skew_table` tracks whether a scalar multplier is even or odd - * (if it's even, we need to subtract the point from the total result, - * because our windowed non-adjacent form values can only be odd) - * - **/ - -struct multiplication_thread_state { - g1::element* buckets; - const uint64_t* point_schedule; -}; - -void compute_wnaf_states(uint64_t* point_schedule, - bool* input_skew_table, - uint64_t* round_counts, - const fr* scalars, - const size_t num_initial_points); - -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); - -void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); - -inline void count_bits(uint32_t* bucket_counts, - uint32_t* bit_offsets, - const uint32_t num_buckets, - const size_t num_bits) -{ - for (size_t i = 0; i < num_buckets; ++i) { - const uint32_t count = bucket_counts[i]; - for (uint32_t j = 0; j < num_bits; ++j) { - bit_offsets[j + 1] += (count & (1U << j)); - } - } - bit_offsets[0] = 0; - for (size_t i = 2; i < num_bits + 1; ++i) { - bit_offsets[i] += bit_offsets[i - 1]; - } -} - -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); - -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); - -void evaluate_addition_chains(affine_product_runtime_state& state, - const size_t max_bucket_bits, - bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); - -} // namespace scalar_multiplication -} // namespace grumpkin diff --git 
a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp deleted file mode 100644 index fa2e5f15c6..0000000000 --- a/cpp/src/barretenberg/ecc/curves/grumpkin/scalar_multiplication/scalar_multiplication.test.cpp +++ /dev/null @@ -1,946 +0,0 @@ -#include -#include -#include - -#include "pippenger.hpp" -#include "scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" -#include "barretenberg/common/test.hpp" -#include "barretenberg/srs/io.hpp" -#include "barretenberg/numeric/random/engine.hpp" - -// paths are relative to cpp/build/ -std::string GRUMPKIN_SRS_PATH = "../srs_db/grumpkin"; - -using namespace grumpkin; -using namespace grumpkin::scalar_multiplication; - -namespace { -auto& engine = numeric::random::get_debug_engine(); -} - -TEST(grumpkin_scalar_multiplication, fake_transcript_io) -{ - size_t file_num = 0; - std::string transcript_path = io::get_transcript_path(GRUMPKIN_SRS_PATH, file_num); - - std::vector srs(3); - grumpkin::io::read_transcript_g1(&srs[0], /*degree=*/3, GRUMPKIN_SRS_PATH); - // the SRS is [x^i]_1 where x = 2 - EXPECT_EQ(static_cast(g1::one), srs[0]); - EXPECT_EQ(static_cast(g1::one + g1::one), srs[1]); - EXPECT_EQ(static_cast(g1::one + g1::one + g1::one + g1::one), srs[2]); -} - -TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) -{ - constexpr size_t num_points = 128; - auto pippenger = Pippenger(GRUMPKIN_SRS_PATH, num_points / 2); - auto monomials = pippenger.get_point_table(); - - std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); - std::array bucket_empty_status; - - std::array transcript; - std::array transcript_points; - transcript_points[0] = 0x0; - transcript_points[1] = 0x2; - transcript_points[2] = 0x4; - transcript_points[3] = 0x6; - transcript_points[4] = 0xb; - transcript_points[5] = 0xc; - transcript_points[6] = 0xe; - transcript_points[7] = 0x11; - transcript_points[8] = 0x13; - transcript_points[9] = 0x14; - transcript_points[10] = 0x15; - transcript_points[11] = 0x16; - transcript_points[12] = 0x17; - transcript_points[13] = 0x18; - transcript_points[14] = 0x20; - transcript_points[15] = 0x21; - transcript_points[16] = 0x22; - transcript_points[17] = 0x27; - transcript_points[18] = 0x29; - transcript_points[19] = 0x2b; - transcript_points[20] = 0x2c; - transcript_points[21] = 0x2d; - transcript_points[22] = 0x2e; - transcript_points[23] = 0x36; - transcript_points[24] = 0x37; - transcript_points[25] = 0x38; - transcript_points[26] = 0x3e; - transcript_points[27] = 0x3f; - transcript_points[28] = 0x4e; - transcript_points[29] = 0x4f; - transcript_points[30] = 0x50; - transcript_points[31] = 0x51; - transcript_points[32] = 0x41; - transcript_points[33] = 0x52; - transcript_points[34] = 0x53; - transcript_points[35] = 0x54; - transcript_points[36] = 0x43; - transcript_points[37] = 0x57; - transcript_points[38] = 0x46; - transcript_points[39] = 0x58; - transcript_points[40] = 0x5b; - transcript_points[41] = 0x5e; - transcript_points[42] = 0x42; - transcript_points[43] = 0x47; - transcript_points[44] = 0x4b; - transcript_points[45] = 0x4d; - transcript_points[46] = 0x6b; - transcript_points[47] = 0x65; - transcript_points[48] = 0x6d; - transcript_points[49] = 0x67; - transcript_points[50] = 0x6f; - transcript_points[51] = 0x68; - transcript_points[52] = 0x69; - transcript_points[53] = 0x6a; - transcript_points[54] = 0x71; - transcript_points[55] = 0x72; - 
transcript_points[56] = 0x73; - transcript_points[57] = 0x74; - transcript_points[58] = 0x75; - transcript_points[59] = 0x66; - transcript_points[60] = 0x79; - transcript_points[62] = 0x7c; - transcript_points[61] = 0x7e; - transcript_points[63] = 0x7f; - transcript_points[64] = 0x1; - transcript_points[65] = 0x3; - transcript_points[66] = 0x5; - transcript_points[67] = 0x7; - transcript_points[68] = 0x8; - transcript_points[69] = 0x9; - transcript_points[70] = 0xa; - transcript_points[71] = 0xd; - transcript_points[72] = 0xf; - transcript_points[73] = 0x10; - transcript_points[74] = 0x12; - transcript_points[75] = 0x19; - transcript_points[76] = 0x1a; - transcript_points[77] = 0x1b; - transcript_points[78] = 0x1c; - transcript_points[79] = 0x1d; - transcript_points[80] = 0x1e; - transcript_points[81] = 0x1f; - transcript_points[82] = 0x23; - transcript_points[83] = 0x24; - transcript_points[84] = 0x25; - transcript_points[85] = 0x26; - transcript_points[86] = 0x28; - transcript_points[87] = 0x2a; - transcript_points[88] = 0x2f; - transcript_points[89] = 0x30; - transcript_points[90] = 0x31; - transcript_points[91] = 0x32; - transcript_points[92] = 0x33; - transcript_points[93] = 0x34; - transcript_points[94] = 0x35; - transcript_points[95] = 0x39; - transcript_points[96] = 0x3a; - transcript_points[97] = 0x3b; - transcript_points[98] = 0x3c; - transcript_points[99] = 0x3d; - transcript_points[100] = 0x48; - transcript_points[101] = 0x49; - transcript_points[102] = 0x55; - transcript_points[103] = 0x56; - transcript_points[104] = 0x4a; - transcript_points[105] = 0x44; - transcript_points[106] = 0x45; - transcript_points[107] = 0x40; - transcript_points[108] = 0x59; - transcript_points[109] = 0x5a; - transcript_points[110] = 0x5c; - transcript_points[111] = 0x5d; - transcript_points[112] = 0x5f; - transcript_points[113] = 0x60; - transcript_points[114] = 0x61; - transcript_points[115] = 0x62; - transcript_points[116] = 0x63; - transcript_points[117] = 0x4c; - transcript_points[118] = 0x6c; - transcript_points[119] = 0x6e; - transcript_points[120] = 0x64; - transcript_points[121] = 0x70; - transcript_points[122] = 0x77; - transcript_points[123] = 0x78; - transcript_points[124] = 0x76; - transcript_points[125] = 0x7a; - transcript_points[126] = 0x7b; - transcript_points[127] = 0x7d; - - for (size_t i = 0; i < 64; ++i) { - transcript[i] = 0; - transcript[i + 64] = 1; - } - for (size_t i = 0; i < num_points; ++i) { - point_schedule[i] = (static_cast(transcript_points[i]) << 32ULL) + transcript[i]; - } - std::array expected; - for (size_t i = 0; i < num_points; ++i) { - expected[i].self_set_infinity(); - } - - for (size_t i = 0; i < num_points; ++i) { - size_t schedule = transcript[i] & 0x7fffffffU; - { - expected[schedule] += monomials[static_cast(transcript_points[i])]; - } - } - - std::array point_pairs; - std::array output_buckets; - std::array scratch_space; - std::array bucket_counts; - std::array bit_offsets = { 0 }; - - scalar_multiplication::affine_product_runtime_state product_state{ - &monomials[0], &point_pairs[0], &output_buckets[0], - &scratch_space[0], &bucket_counts[0], &bit_offsets[0], - &point_schedule[0], num_points, 2, - &bucket_empty_status[0] - }; - - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); - - for (size_t i = 0; i < product_state.num_buckets; ++i) { - expected[i] = expected[i].normalize(); - EXPECT_EQ((output[i].x == expected[i].x), true); - EXPECT_EQ((output[i].y == expected[i].y), true); - } -} - 
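// A minimal sketch of the point_schedule packing convention described in the deleted
// scalar_multiplication.hpp comment and built by hand in the test above: point index in
// the high 32 bits, sign ("negate") flag in bit 31, bucket index in the low 31 bits.
// The helper name is hypothetical; the masks and shifts match those used by the code.
#include <cassert>
#include <cstdint>

inline uint64_t pack_schedule_entry(uint64_t point_index, bool negate, uint64_t bucket_index)
{
    return (point_index << 32ULL) | (static_cast<uint64_t>(negate) << 31ULL) | (bucket_index & 0x7fffffffULL);
}

int main()
{
    const uint64_t entry = pack_schedule_entry(/*point_index=*/0x7c, /*negate=*/true, /*bucket_index=*/1);

    const uint64_t point_index = entry >> 32ULL;         // which point table entry to add
    const uint64_t predicate = (entry >> 31ULL) & 1ULL;  // 1 => conditionally negate before adding
    const uint64_t bucket_index = entry & 0x7fffffffULL; // which bucket receives the point

    assert(point_index == 0x7c);
    assert(predicate == 1);
    assert(bucket_index == 1);
    return 0;
}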
-TEST(grumpkin_scalar_multiplication, reduce_buckets) -{ - constexpr size_t num_initial_points = 1 << 12; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points * 2))); - g1::element* expected_buckets = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points * 2))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points * 2))); - - memset((void*)scratch_points, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, (num_points * 2) * sizeof(g1::affine_element)); - memset((void*)expected_buckets, 0x00, (num_points * 2) * sizeof(g1::element)); - memset((void*)bucket_empty_status, 0x00, (num_points * 2) * sizeof(bool)); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - - // WORKTODO: unify by using 0 g2 elts - grumpkin::io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - for (size_t i = 0; i < num_initial_points; ++i) { - scalars[i] = fr::random_element(); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * 100 * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - - uint64_t* point_schedule_copy = static_cast(aligned_alloc(64, sizeof(uint64_t) * num_points * 2)); - for (size_t i = 0; i < num_points; ++i) { - state.point_schedule[i + num_points] = state.point_schedule[i + num_points] & 0xffffffff7fffffffUL; - // printf("state.point_schedule[%lu] = %lx \n", i, state.point_schedule[i]); - point_schedule_copy[i] = state.point_schedule[i + num_points]; - } - const size_t first_bucket = point_schedule_copy[0] & 0x7fffffffULL; - const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - 
&state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - for (size_t i = 0; i < num_points; ++i) { - expected_buckets[i].self_set_infinity(); - } - for (size_t i = 0; i < num_points; ++i) { - uint64_t schedule = point_schedule_copy[i]; - uint64_t bucket_index = schedule & 0x7fffffffU; - uint64_t point_index = schedule >> 32ULL; - uint64_t predicate = (schedule >> 31ULL) & 1ULL; - // printf("expected bucket index = %lu \n", bucket_index - first_bucket); - g1::element& bucket = expected_buckets[bucket_index - first_bucket]; - g1::affine_element& point = monomials[point_index]; - bucket.self_mixed_add_or_sub(point, predicate); - } - - size_t it = 0; - - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); - - printf("num buckets = %zu \n", num_buckets); - for (size_t i = 0; i < num_buckets; ++i) { - if (!bucket_empty_status[i]) { - g1::element expected = expected_buckets[i].normalize(); - EXPECT_EQ((expected.x == result_buckets[it].x), true); - EXPECT_EQ((expected.y == result_buckets[it].y), true); - ++it; - } else { - printf("recorded empty bucket???\n"); - } - } - aligned_free(bucket_empty_status); - aligned_free(expected_buckets); - aligned_free(point_schedule_copy); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -// This test intermittenly fails. -TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* scratch_points = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - g1::affine_element* point_pairs = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - bool* bucket_empty_status = (bool*)(aligned_alloc(64, sizeof(bool) * (num_points))); - - fq* scratch_field = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points))); - - memset((void*)scratch_points, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)point_pairs, 0x00, num_points * sizeof(g1::affine_element)); - memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = 
std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(point_pairs); - aligned_free(scratch_points); - aligned_free(scratch_field); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(grumpkin_scalar_multiplication, add_affine_points) -{ - constexpr size_t num_points = 20; - g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - fq* scratch_space = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - fq* lambda = (fq*)(aligned_alloc(64, sizeof(fq) * (num_points * 2))); - - g1::element* points_copy = (g1::element*)(aligned_alloc(64, sizeof(g1::element) * (num_points))); - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - points_copy[i].x = points[i].x; - points_copy[i].y = points[i].y; - points_copy[i].z = fq::one(); - } - - size_t count = num_points - 1; - for (size_t i = num_points - 2; i < num_points; i -= 2) { - points_copy[count--] = points_copy[i] + points_copy[i + 1]; - points_copy[count + 1] = points_copy[count + 1].normalize(); - } - - scalar_multiplication::add_affine_points(points, num_points, scratch_space); - for (size_t i = num_points - 1; i > num_points - 1 - (num_points / 2); --i) { - EXPECT_EQ((points[i].x == points_copy[i].x), true); - EXPECT_EQ((points[i].y == points_copy[i].y), true); - } - aligned_free(lambda); - aligned_free(points); - aligned_free(points_copy); - aligned_free(scratch_space); -} - -TEST(grumpkin_scalar_multiplication, construct_addition_chains) -{ - constexpr size_t num_initial_points = 1 << 20; - constexpr size_t num_points = num_initial_points * 2; - g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * 
num_initial_points)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < num_initial_points; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); - - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); - std::cout << "wnaf time: " << diff.count() << "ms" << std::endl; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, num_points); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "organize bucket time: " << diff.count() << "ms" << std::endl; - const size_t max_num_buckets = scalar_multiplication::get_num_buckets(num_points * 2); - bool* bucket_empty_status = static_cast(aligned_alloc(64, num_points * sizeof(bool))); - uint32_t* bucket_counts = static_cast(aligned_alloc(64, max_num_buckets * sizeof(uint32_t))); - memset((void*)bucket_counts, 0x00, max_num_buckets * sizeof(uint32_t)); - std::array bit_offsets = { 0 }; - const size_t first_bucket = state.point_schedule[0] & 0x7fffffffULL; - const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; - const size_t num_buckets = last_bucket - first_bucket + 1; - - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - num_points, - static_cast(num_buckets), - bucket_empty_status }; - - start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); - end = std::chrono::steady_clock::now(); - diff = std::chrono::duration_cast(end - start); - std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; - - aligned_free(bucket_empty_status); - aligned_free(scalars); - aligned_free(monomials); - aligned_free(bucket_counts); -} - -TEST(grumpkin_scalar_multiplication, endomorphism_split) -{ - fr scalar = fr::random_element(); - - g1::element expected = g1::one * scalar; - - // we want to test that we can split a scalar into two half-length components, using the same location in memory. - fr* k1_t = &scalar; - fr* k2_t = (fr*)&scalar.data[2]; - - fr::split_into_endomorphism_scalars(scalar, *k1_t, *k2_t); - // The compiler really doesn't like what we're doing here, - // and disabling the array-bounds error project-wide seems unsafe. 
- // The large macro blocks are here to warn that we should be careful when - // aliasing the arguments to split_into_endomorphism_scalars -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - fr k1{ (*k1_t).data[0], (*k1_t).data[1], 0, 0 }; - fr k2{ (*k2_t).data[0], (*k2_t).data[1], 0, 0 }; -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - g1::element result; - g1::element t1 = g1::affine_one * k1; - g1::affine_element generator = g1::affine_one; - fq beta = fq::cube_root_of_unity(); - generator.x = generator.x * beta; - generator.y = -generator.y; - g1::element t2 = generator * k2; - result = t1 + t2; - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, radix_sort) -{ - // check that our radix sort correctly sorts! - constexpr size_t target_degree = 1 << 8; - constexpr size_t num_rounds = scalar_multiplication::get_num_rounds(target_degree * 2); - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - for (size_t i = 0; i < target_degree; ++i) { - source_scalar.self_sqr(); - fr::__copy(source_scalar, scalars[i]); - } - - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( - state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); - - uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); - memcpy((void*)wnaf_copy, (void*)state.point_schedule, sizeof(uint64_t) * target_degree * 2 * num_rounds); - - scalar_multiplication::organize_buckets(state.point_schedule, state.round_counts, target_degree * 2); - for (size_t i = 0; i < num_rounds; ++i) { - uint64_t* unsorted_wnaf = &wnaf_copy[i * target_degree * 2]; - uint64_t* sorted_wnaf = &state.point_schedule[i * target_degree * 2]; - - const auto find_entry = [unsorted_wnaf, num_entries = target_degree * 2](auto x) { - for (size_t k = 0; k < num_entries; ++k) { - if (unsorted_wnaf[k] == x) { - return true; - } - } - return false; - }; - for (size_t j = 0; j < target_degree * 2; ++j) { - EXPECT_EQ(find_entry(sorted_wnaf[j]), true); - if (j > 0) { - EXPECT_EQ((sorted_wnaf[j] & 0x7fffffffU) >= (sorted_wnaf[j - 1] & 0x7fffffffU), true); - } - } - } - - free(scalars); - free(wnaf_copy); -} - -HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) -{ - // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. 
- // Check that this is done correctly - size_t transcript_degree = 1 << 20; - size_t target_degree = 1200000; - g1::affine_element* monomials = - (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - io::read_transcript(monomials, transcript_degree, GRUMPKIN_SRS_PATH); - - memcpy((void*)(monomials + (2 * transcript_degree)), - (void*)monomials, - ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); - - fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); - - fr source_scalar = fr::random_element(); - fr accumulator = source_scalar; - for (size_t i = 0; i < target_degree; ++i) { - accumulator *= source_scalar; - fr::__copy(accumulator, scalars[i]); - } - scalar_multiplication::pippenger_runtime_state state(target_degree); - - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); - first = first.normalize(); - - for (size_t i = 0; i < target_degree; ++i) { - scalars[i].self_neg(); - } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); - - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); - second = second.normalize(); - - EXPECT_EQ((first.z == second.z), true); - EXPECT_EQ((first.z == fq::one()), true); - EXPECT_EQ((first.x == second.x), true); - EXPECT_EQ((first.y == -second.y), true); - - aligned_free(monomials); - aligned_free(scalars); -} - -TEST(grumpkin_scalar_multiplication, undersized_inputs) -{ - // we fall back to traditional scalar multiplication algorithm for small input sizes. - // Check this is done correctly - size_t num_points = 17; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, 
num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) -{ - constexpr size_t num_points = 128; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - g1::affine_element point = g1::affine_element(g1::element::random_element()); - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = point; - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - if (!expected.is_point_at_infinity()) { - expected = expected.normalize(); - } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_unsafe) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = scalar_multiplication::point_table_alloc(num_points); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = 
fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) -{ - constexpr size_t num_points = 8192; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * num_points); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - points[i] = g1::affine_element(g1::element::random_element()); - } - for (size_t i = 0; i < (num_points / 4); ++i) { - scalars[i * 4].data[0] = engine.get_random_uint32(); - scalars[i * 4].data[1] = engine.get_random_uint32(); - scalars[i * 4].data[2] = engine.get_random_uint32(); - scalars[i * 4].data[3] = engine.get_random_uint32(); - scalars[i * 4] = scalars[i * 4].to_montgomery_form(); - scalars[i * 4 + 1].data[0] = 0; - scalars[i * 4 + 1].data[1] = 0; - scalars[i * 4 + 1].data[2] = 0; - scalars[i * 4 + 1].data[3] = 0; - scalars[i * 4 + 1] = scalars[i * 4 + 1].to_montgomery_form(); - scalars[i * 4 + 2].data[0] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[1] = engine.get_random_uint32(); - scalars[i * 4 + 2].data[2] = 0; - scalars[i * 4 + 2].data[3] = 0; - scalars[i * 4 + 2] = scalars[i * 4 + 2].to_montgomery_form(); - scalars[i * 4 + 3].data[0] = (engine.get_random_uint32() & 0x07ULL); - scalars[i * 4 + 3].data[1] = 0; - scalars[i * 4 + 3].data[2] = 0; - scalars[i * 4 + 3].data[3] = 0; - scalars[i * 4 + 3] = scalars[i * 4 + 3].to_montgomery_form(); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_one) -{ - size_t num_points = 1; - - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr) * 1); - - g1::affine_element* points = - (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * num_points * 2 + 1); - - for (size_t i = 0; i < num_points; ++i) { - scalars[i] = fr::random_element(); - points[i] = g1::affine_element(g1::element::random_element()); - } - - g1::element expected; - expected.self_set_infinity(); - for (size_t i = 0; i < num_points; ++i) { - g1::element temp = points[i] * scalars[i]; - expected += temp; - } - expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - - g1::element result = scalar_multiplication::pippenger(scalars, points, 
num_points, state); - result = result.normalize(); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result == expected, true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_zero_points) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} - -TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) -{ - fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); - - g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - - scalars[0] = fr::zero(); - points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); - - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); - - aligned_free(scalars); - aligned_free(points); - - EXPECT_EQ(result.is_point_at_infinity(), true); -} diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp deleted file mode 100644 index b490f4973d..0000000000 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// #include "scalar_multiplication.hpp" -// #include "pippenger.hpp" -// #include "barretenberg/common/mem.hpp" - -// using namespace barretenberg; - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size) -// { -// auto ptr = aligned_alloc(64, size); -// return ptr; -// } - -// WASM_EXPORT void bbfree(void* ptr) -// { -// aligned_free(ptr); -// } - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t num_points) -// { -// auto ptr = new scalar_multiplication::Pippenger(points, num_points); -// return ptr; -// } - -// WASM_EXPORT void delete_pippenger(void* pippenger) -// { -// delete reinterpret_cast(pippenger); -// } - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr) -// { -// scalar_multiplication::pippenger_runtime_state state(range); -// auto pippenger = reinterpret_cast(pippenger_ptr); -// auto scalars = reinterpret_cast(scalars_ptr); -// auto result = reinterpret_cast(result_ptr); -// *result = pippenger->pippenger_unsafe(scalars, from, range); -// } - -// WASM_EXPORT void g1_sum(void* points_ptr, const size_t num_points, void* result_ptr) -// { -// auto points = reinterpret_cast(points_ptr); -// auto result = reinterpret_cast(result_ptr); -// result->self_set_infinity(); -// *result = std::accumulate(points, points + num_points, *result); -// } -// } diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp deleted file mode 100644 index 60af544690..0000000000 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/c_bind.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #include -// #include "../g1.hpp" - -// #define WASM_EXPORT __attribute__((visibility("default"))) - -// extern "C" { - -// WASM_EXPORT void* bbmalloc(size_t size); - -// WASM_EXPORT void bbfree(void* ptr); - -// WASM_EXPORT void* new_pippenger(uint8_t* points, size_t 
num_points); - -// WASM_EXPORT void delete_pippenger(void* pippenger); - -// WASM_EXPORT void pippenger_unsafe(void* pippenger_ptr, void* scalars_ptr, size_t from, size_t range, void* -// result_ptr); WASM_EXPORT void g1_sum(void* points_ptr, size_t num_points, void* result_ptr); -// } diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp index 08c6b62960..c32ced45c9 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.cpp @@ -1,44 +1,52 @@ #include "pippenger.hpp" #include "barretenberg/srs/io.hpp" -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { -Pippenger::Pippenger(g1::affine_element* points, size_t num_points) +template +Pippenger::Pippenger(typename Curve::AffineElement* points, size_t num_points) : monomials_(points) , num_points_(num_points) { - grumpkin::io::byteswap(&monomials_[0], num_points * 64); - scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + io::byteswap(&monomials_[0], num_points * 64); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -Pippenger::Pippenger(uint8_t const* points, size_t num_points) +template +Pippenger::Pippenger(uint8_t const* points, size_t num_points) : num_points_(num_points) { monomials_ = point_table_alloc(num_points); - grumpkin::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + barretenberg::io::read_g1_elements_from_buffer(&monomials_[0], (char*)points, num_points * 64); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -Pippenger::Pippenger(std::string const& path, size_t num_points) +template +Pippenger::Pippenger(std::string const& path, size_t num_points) : num_points_(num_points) { monomials_ = point_table_alloc(num_points); - grumpkin::io::read_transcript_g1(monomials_, num_points, path); - grumpkin::scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); + barretenberg::io::read_transcript_g1(monomials_, num_points, path); + scalar_multiplication::generate_pippenger_point_table(monomials_, monomials_, num_points); } -g1::element Pippenger::pippenger_unsafe(fr* scalars, size_t from, size_t range) +template +typename Curve::Element Pippenger::pippenger_unsafe(typename Curve::ScalarField* scalars, + size_t from, + size_t range) { - scalar_multiplication::pippenger_runtime_state state(range); - return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); + scalar_multiplication::pippenger_runtime_state state(range); + return scalar_multiplication::pippenger_unsafe(scalars, monomials_ + from * 2, range, state); } -Pippenger::~Pippenger() +template Pippenger::~Pippenger() { free(monomials_); } +template class Pippenger; + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp index a6c55f1bc4..f9a9e1e797 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp @@ -7,7 +7,7 @@ #include #endif -namespace 
grumpkin { +namespace barretenberg { namespace scalar_multiplication { inline size_t point_table_size(size_t num_points) @@ -32,14 +32,17 @@ template inline T* point_table_alloc(size_t num_points) return (T*)aligned_alloc(64, point_table_buf_size(num_points)); } -class Pippenger { +template class Pippenger { public: + using ScalarField = typename Curve::ScalarField; + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; /** * Expects points to be buffer of size as per point_table_size(). * It expects the crs to start at points[1], and it fills in affine_one at points[0]. * The crs undergoes a byteswap, and then the point table is generated. */ - Pippenger(g1::affine_element* points, size_t num_points); + Pippenger(AffineElement* points, size_t num_points); Pippenger(uint8_t const* points, size_t num_points); @@ -47,16 +50,18 @@ class Pippenger { ~Pippenger(); - g1::element pippenger_unsafe(fr* scalars, size_t from, size_t range); + Element pippenger_unsafe(ScalarField* scalars, size_t from, size_t range); - g1::affine_element* get_point_table() const { return monomials_; } + AffineElement* get_point_table() const { return monomials_; } size_t get_num_points() const { return num_points_; } private: - g1::affine_element* monomials_; + AffineElement* monomials_; size_t num_points_; }; +extern template class Pippenger; + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp index f56bdaa936..01f92b8673 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.cpp @@ -2,7 +2,7 @@ #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept { @@ -61,4 +61,4 @@ void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uin radix_sort(wnaf_entries, num_entries, shift); } } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp index d4ef31da06..bde5916663 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/process_buckets.hpp @@ -3,10 +3,10 @@ #include #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { void radix_sort(uint64_t* keys, const size_t num_entries, const uint32_t shift) noexcept; void process_buckets(uint64_t* wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept; } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp index 36d894eafa..9e0ba1fc03 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.cpp @@ -8,16 +8,19 @@ #include #endif -namespace grumpkin { 
+namespace barretenberg { namespace scalar_multiplication { -pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) +template pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points) { + using Fq = typename Curve::BaseField; + using AffineElement = typename Curve::AffineElement; + constexpr size_t MAX_NUM_ROUNDS = 256; num_points = num_initial_points * 2; const size_t num_points_floor = static_cast(1ULL << (numeric::get_msb(num_points))); const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); + 1U << scalar_multiplication::get_optimal_bucket_width(static_cast(num_initial_points))); #ifndef NO_MULTITHREADING const size_t num_threads = max_threads::compute_num_threads(); #else @@ -25,15 +28,15 @@ pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points #endif const size_t prefetch_overflow = 16 * num_threads; const size_t num_rounds = - static_cast(grumpkin::scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); + static_cast(scalar_multiplication::get_num_rounds(static_cast(num_points_floor))); point_schedule = (uint64_t*)(aligned_alloc( 64, (static_cast(num_points) * num_rounds + prefetch_overflow) * sizeof(uint64_t))); skew_table = (bool*)(aligned_alloc(64, pad(static_cast(num_points) * sizeof(bool), 64))); - point_pairs_1 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - point_pairs_2 = (g1::affine_element*)(aligned_alloc( - 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(g1::affine_element))); - scratch_space = (fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(g1::affine_element))); + point_pairs_1 = (AffineElement*)(aligned_alloc( + 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(AffineElement))); + point_pairs_2 = (AffineElement*)(aligned_alloc( + 64, (static_cast(num_points) * 2 + (num_threads * 16)) * sizeof(AffineElement))); + scratch_space = (Fq*)(aligned_alloc(64, static_cast(num_points) * sizeof(AffineElement))); bucket_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); bit_counts = (uint32_t*)(aligned_alloc(64, num_threads * num_buckets * sizeof(uint32_t))); bucket_empty_status = (bool*)(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))); @@ -45,13 +48,9 @@ pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points #endif for (size_t i = 0; i < num_threads; ++i) { const size_t thread_offset = i * points_per_thread; - memset((void*)(point_pairs_1 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(point_pairs_2 + thread_offset + (i * 16)), - 0, - (points_per_thread + 16) * sizeof(g1::affine_element)); - memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(fq)); + memset((void*)(point_pairs_1 + thread_offset + (i * 16)), 0, (points_per_thread + 16) * sizeof(AffineElement)); + memset((void*)(point_pairs_2 + thread_offset + (i * 16)), 0, (points_per_thread + 16) * sizeof(AffineElement)); + memset((void*)(scratch_space + thread_offset), 0, (points_per_thread) * sizeof(Fq)); for (size_t j = 0; j < num_rounds; ++j) { const size_t round_offset = (j * static_cast(num_points)); memset((void*)(point_schedule + round_offset + thread_offset), 0, points_per_thread * sizeof(uint64_t)); @@ -65,7 +64,7 @@ 
pippenger_runtime_state::pippenger_runtime_state(const size_t num_initial_points memset((void*)round_counts, 0, MAX_NUM_ROUNDS * sizeof(uint64_t)); } -pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) +template pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other) { point_schedule = other.point_schedule; skew_table = other.skew_table; @@ -90,7 +89,8 @@ pippenger_runtime_state::pippenger_runtime_state(pippenger_runtime_state&& other num_points = other.num_points; } -pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) +template +pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_state&& other) { if (point_schedule) { aligned_free(point_schedule); @@ -152,14 +152,15 @@ pippenger_runtime_state& pippenger_runtime_state::operator=(pippenger_runtime_st return *this; } -affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state(const size_t num_threads, - const size_t thread_index) +template +affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime_state( + const size_t num_threads, const size_t thread_index) { const size_t points_per_thread = static_cast(num_points / num_threads); - const size_t num_buckets = static_cast( - 1U << grumpkin::scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); + const size_t num_buckets = + static_cast(1U << scalar_multiplication::get_optimal_bucket_width(static_cast(num_points) / 2)); - scalar_multiplication::affine_product_runtime_state product_state; + scalar_multiplication::affine_product_runtime_state product_state; product_state.point_pairs_1 = point_pairs_1 + (thread_index * points_per_thread) + (thread_index * 16); product_state.point_pairs_2 = point_pairs_2 + (thread_index * points_per_thread) + (thread_index * 16); @@ -170,7 +171,7 @@ affine_product_runtime_state pippenger_runtime_state::get_affine_product_runtime return product_state; } -pippenger_runtime_state::~pippenger_runtime_state() +template pippenger_runtime_state::~pippenger_runtime_state() { if (point_schedule) { aligned_free(point_schedule); @@ -208,5 +209,10 @@ pippenger_runtime_state::~pippenger_runtime_state() aligned_free(round_counts); } } + +template struct affine_product_runtime_state; +template struct affine_product_runtime_state; +template struct pippenger_runtime_state; +template struct pippenger_runtime_state; } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp index 050c955c8c..b0102485f4 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp @@ -1,9 +1,11 @@ #pragma once -#include "../grumpkin.hpp" +// #include "../g1.hpp" +#include "../grumpkin/grumpkin.hpp" +#include "../bn254/bn254.hpp" #include "barretenberg/ecc/groups/wnaf.hpp" -namespace grumpkin { +namespace barretenberg { // simple helper functions to retrieve pointers to pre-allocated memory for the scalar multiplication algorithm. // This is to eliminate page faults when allocating (and writing) to large tranches of memory. 
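The explicit instantiations just above pair with the extern template declarations added to the header: template bodies stay out of every includer, and each curve gets exactly one compiled instantiation to link against. A generic sketch of that idiom, with placeholder names:

// header (sketch): define the template, but suppress implicit instantiation in includers
struct Bn254Tag {};
struct GrumpkinTag {};
template <typename Curve> struct Widget {
    int run() const { return 42; }
};
extern template struct Widget<Bn254Tag>;
extern template struct Widget<GrumpkinTag>;

// exactly one .cpp (sketch): provide the instantiations every translation unit links against
template struct Widget<Bn254Tag>;
template struct Widget<GrumpkinTag>;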
namespace scalar_multiplication { @@ -63,12 +65,11 @@ constexpr size_t get_num_rounds(const size_t num_points) return WNAF_SIZE(bits_per_bucket + 1); } -// WORKTODO: uniformize -struct affine_product_runtime_state { - g1::affine_element* points; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; +template struct affine_product_runtime_state { + typename Curve::AffineElement* points; + typename Curve::AffineElement* point_pairs_1; + typename Curve::AffineElement* point_pairs_2; + typename Curve::BaseField* scratch_space; uint32_t* bucket_counts; uint32_t* bit_offsets; uint64_t* point_schedule; @@ -77,12 +78,12 @@ struct affine_product_runtime_state { bool* bucket_empty_status; }; -struct pippenger_runtime_state { +template struct pippenger_runtime_state { uint64_t* point_schedule; bool* skew_table; - g1::affine_element* point_pairs_1; - g1::affine_element* point_pairs_2; - fq* scratch_space; + typename Curve::AffineElement* point_pairs_1; + typename Curve::AffineElement* point_pairs_2; + typename Curve::BaseField* scratch_space; uint32_t* bucket_counts; uint32_t* bit_counts; bool* bucket_empty_status; @@ -94,7 +95,13 @@ struct pippenger_runtime_state { pippenger_runtime_state& operator=(pippenger_runtime_state&& other); ~pippenger_runtime_state(); - affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, const size_t thread_index); + affine_product_runtime_state get_affine_product_runtime_state(const size_t num_threads, + const size_t thread_index); }; + +extern template struct affine_product_runtime_state; +extern template struct affine_product_runtime_state; +extern template struct pippenger_runtime_state; +extern template struct pippenger_runtime_state; } // namespace scalar_multiplication -} // namespace grumpkin \ No newline at end of file +} // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp index d61158b143..b66e775337 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.cpp @@ -10,11 +10,7 @@ #include #include -#include "../../../groups/wnaf.hpp" -#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" -#include "barretenberg/ecc/curves/bn254/fq.hpp" -#include "barretenberg/ecc/curves/bn254/fr.hpp" -#include "barretenberg/ecc/curves/bn254/g1.hpp" +#include "../../groups/wnaf.hpp" #include "./process_buckets.hpp" #include "./runtime_states.hpp" @@ -57,55 +53,59 @@ uint64_t schedule_o = state.point_schedule[schedule_it + 14]; \ uint64_t schedule_p = state.point_schedule[schedule_it + 15]; \ \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_a >> 32ULL), state.point_pairs_1 + current_offset, (schedule_a >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_b >> 32ULL), state.point_pairs_1 + current_offset + 1, (schedule_b >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_c >> 32ULL), state.point_pairs_1 + current_offset + 2, (schedule_c >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_d >> 32ULL), state.point_pairs_1 + current_offset + 3, (schedule_d >> 31ULL) & 
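For context on get_optimal_bucket_width() and get_num_rounds(): with a c-bit window, a 254-bit scalar needs roughly 254/c rounds, and each round costs about num_points bucket insertions plus on the order of 2^c additions to fold the buckets. The helper below only illustrates that cost model under those rough assumptions; the library itself picks the width from hand-tuned thresholds rather than a search, and pick_bucket_width/approx_cost are hypothetical names.

#include <cstdint>

namespace sketch {
// Approximate Pippenger cost for n points with a c-bit bucket window over a 254-bit scalar:
// rounds * (per-round bucket insertions + per-round bucket folds).
inline uint64_t approx_cost(uint64_t n, uint64_t c)
{
    const uint64_t rounds = (254 + c - 1) / c;
    return rounds * (n + (uint64_t(1) << c));
}

// Brute-force the window size that minimises the model above (illustrative only).
inline uint64_t pick_bucket_width(uint64_t n)
{
    uint64_t best_c = 1;
    uint64_t best = approx_cost(n, 1);
    for (uint64_t c = 2; c <= 24; ++c) {
        if (approx_cost(n, c) < best) {
            best = approx_cost(n, c);
            best_c = c;
        }
    }
    return best_c;
}
} // namespace sketch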
1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_e >> 32ULL), state.point_pairs_1 + current_offset + 4, (schedule_e >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_f >> 32ULL), state.point_pairs_1 + current_offset + 5, (schedule_f >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_g >> 32ULL), state.point_pairs_1 + current_offset + 6, (schedule_g >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_h >> 32ULL), state.point_pairs_1 + current_offset + 7, (schedule_h >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_i >> 32ULL), state.point_pairs_1 + current_offset + 8, (schedule_i >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine( \ + Group::conditional_negate_affine( \ state.points + (schedule_j >> 32ULL), state.point_pairs_1 + current_offset + 9, (schedule_j >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ - state.point_pairs_1 + current_offset + 10, \ - (schedule_k >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ - state.point_pairs_1 + current_offset + 11, \ - (schedule_l >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ - state.point_pairs_1 + current_offset + 12, \ - (schedule_m >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ - state.point_pairs_1 + current_offset + 13, \ - (schedule_n >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ - state.point_pairs_1 + current_offset + 14, \ - (schedule_o >> 31ULL) & 1ULL); \ - g1::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ - state.point_pairs_1 + current_offset + 15, \ - (schedule_p >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_k >> 32ULL), \ + state.point_pairs_1 + current_offset + 10, \ + (schedule_k >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_l >> 32ULL), \ + state.point_pairs_1 + current_offset + 11, \ + (schedule_l >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_m >> 32ULL), \ + state.point_pairs_1 + current_offset + 12, \ + (schedule_m >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_n >> 32ULL), \ + state.point_pairs_1 + current_offset + 13, \ + (schedule_n >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_o >> 32ULL), \ + state.point_pairs_1 + current_offset + 14, \ + (schedule_o >> 31ULL) & 1ULL); \ + Group::conditional_negate_affine(state.points + (schedule_p >> 32ULL), \ + state.point_pairs_1 + current_offset + 15, \ + (schedule_p >> 31ULL) & 1ULL); \ \ current_offset += 16; \ schedule_it += 16; -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points) +template +void generate_pippenger_point_table(typename Curve::AffineElement* points, + typename Curve::AffineElement* table, + size_t num_points) { // iterate backwards, so that `points` and `table` can point to the same memory location - fq beta = fq::cube_root_of_unity(); + using Fq = typename Curve::BaseField; + Fq beta 
= Fq::cube_root_of_unity(); for (size_t i = num_points - 1; i < num_points; --i) { table[i * 2] = points[i]; table[i * 2 + 1].x = beta * points[i].x; @@ -193,12 +193,14 @@ void generate_pippenger_point_table(g1::affine_element* points, g1::affine_eleme * @param scalars The pointer to the region with initial scalars that need to be converted into WNAF * @param num_initial_points The number of points before the endomorphism split **/ +template void compute_wnaf_states(uint64_t* point_schedule, bool* input_skew_table, uint64_t* round_counts, - const fr* scalars, + const typename Curve::ScalarField* scalars, const size_t num_initial_points) { + using Fr = typename Curve::ScalarField; const size_t num_points = num_initial_points * 2; constexpr size_t MAX_NUM_ROUNDS = 256; constexpr size_t MAX_NUM_THREADS = 128; @@ -222,30 +224,30 @@ void compute_wnaf_states(uint64_t* point_schedule, #pragma omp parallel for #endif for (size_t i = 0; i < num_threads; ++i) { - fr T0; + Fr T0; uint64_t* wnaf_table = &point_schedule[(2 * i) * num_initial_points_per_thread]; - const fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; + const Fr* thread_scalars = &scalars[i * num_initial_points_per_thread]; bool* skew_table = &input_skew_table[(2 * i) * num_initial_points_per_thread]; uint64_t offset = i * num_points_per_thread; for (uint64_t j = 0; j < num_initial_points_per_thread; ++j) { T0 = thread_scalars[j].from_montgomery_form(); - fr::split_into_endomorphism_scalars(T0, T0, *(fr*)&T0.data[2]); - - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[0], - &wnaf_table[(j << 1UL)], - skew_table[j << 1ULL], - &thread_round_counts[i][0], - ((j << 1ULL) + offset) << 32ULL, - num_points, - wnaf_bits); - barretenberg::wnaf::fixed_wnaf_with_counts(&T0.data[2], - &wnaf_table[(j << 1UL) + 1], - skew_table[(j << 1UL) + 1], - &thread_round_counts[i][0], - ((j << 1UL) + offset + 1) << 32UL, - num_points, - wnaf_bits); + Fr::split_into_endomorphism_scalars(T0, T0, *(Fr*)&T0.data[2]); + + wnaf::fixed_wnaf_with_counts(&T0.data[0], + &wnaf_table[(j << 1UL)], + skew_table[j << 1ULL], + &thread_round_counts[i][0], + ((j << 1ULL) + offset) << 32ULL, + num_points, + wnaf_bits); + wnaf::fixed_wnaf_with_counts(&T0.data[2], + &wnaf_table[(j << 1UL) + 1], + skew_table[(j << 1UL) + 1], + &thread_round_counts[i][0], + ((j << 1UL) + offset + 1) << 32UL, + num_points, + wnaf_bits); } } @@ -310,9 +312,13 @@ void organize_buckets(uint64_t* point_schedule, const uint64_t*, const size_t nu * * We can re-arrange the Pippenger algorithm to get this property, but it's...complicated **/ -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space) +template +void add_affine_points(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space) { - fq batch_inversion_accumulator = fq::one(); + using Fq = typename Curve::BaseField; + Fq batch_inversion_accumulator = Fq::one(); for (size_t i = 0; i < num_points; i += 2) { scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1 @@ -347,9 +353,13 @@ void add_affine_points(g1::affine_element* points, const size_t num_points, fq* } } -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space) +template +void add_affine_points_with_edge_cases(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space) { - fq batch_inversion_accumulator = fq::one(); + using Fq = typename Curve::BaseField; + Fq 
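generate_pippenger_point_table() interleaves each point with its GLV endomorphism image, whose x-coordinate is beta * x for the cube root of unity beta, while compute_wnaf_states() splits each scalar into two half-width parts; together they halve the effective scalar length. Below is a toy sketch of the table layout only: the Fq arithmetic and the constant 7 are placeholders, and whether the stored image also negates y depends on the sign convention of split_into_endomorphism_scalars.

#include <cstddef>

namespace sketch {
struct Fq {
    long v;
    Fq operator*(Fq o) const { return { v * o.v }; }
};
struct Affine { Fq x, y; };

// Interleave P_i with its endomorphism image phi(P_i) = (beta * x_i, y_i), matching the
// two-entries-per-point layout produced by generate_pippenger_point_table<Curve>().
inline void build_endo_table(const Affine* points, Affine* table, size_t n)
{
    const Fq beta{ 7 }; // placeholder for the real cube root of unity in Curve::BaseField
    for (size_t i = n; i-- > 0;) { // iterate backwards so `points` and `table` may alias
        table[2 * i] = points[i];
        table[2 * i + 1] = Affine{ beta * points[i].x, points[i].y };
    }
}
} // namespace sketch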
batch_inversion_accumulator = Fq::one(); for (size_t i = 0; i < num_points; i += 2) { if (points[i].is_point_at_infinity() || points[i + 1].is_point_at_infinity()) { @@ -359,7 +369,7 @@ void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t if (points[i].y == points[i + 1].y) { // double scratch_space[i >> 1] = points[i].x + points[i].x; // 2x - fq x_squared = points[i].x.sqr(); + Fq x_squared = points[i].x.sqr(); points[i + 1].x = points[i].y + points[i].y; // 2y points[i + 1].y = x_squared + x_squared + x_squared; // 3x^2 points[i + 1].y *= batch_inversion_accumulator; @@ -415,7 +425,10 @@ void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t * `max_bucket_bits` indicates the largest set of nested pairs in the array, * which defines the iteration depth **/ -void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases) +template +void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases) { size_t end = state.num_points; size_t start = 0; @@ -423,9 +436,9 @@ void evaluate_addition_chains(affine_product_runtime_state& state, const size_t const size_t points_in_round = (state.num_points - state.bit_offsets[i + 1]) >> (i); start = end - points_in_round; if (handle_edge_cases) { - add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); + add_affine_points_with_edge_cases(state.point_pairs_1 + start, points_in_round, state.scratch_space); } else { - add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); + add_affine_points(state.point_pairs_1 + start, points_in_round, state.scratch_space); } } } @@ -450,7 +463,10 @@ void evaluate_addition_chains(affine_product_runtime_state& state, const size_t * The next step is to 'play it again Sam', and recurse back into `reduce_buckets`, with our reduced number of points. * We repeat this process until every bucket only has one point assigned to it. **/ -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool first_round, bool handle_edge_cases) +template +typename Curve::AffineElement* reduce_buckets(affine_product_runtime_state& state, + bool first_round, + bool handle_edge_cases) { // std::chrono::steady_clock::time_point time_start = std::chrono::steady_clock::now(); @@ -516,7 +532,7 @@ g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool fir // modify `num_points` to reflect the new number of reduced points. 
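The batch_inversion_accumulator in add_affine_points() is Montgomery's batch-inversion trick: accumulate every (x2 - x1) denominator on a forward pass, invert the accumulated product once, then peel off the individual inverses on a backward pass, so a whole addition chain costs a single field inversion. The edge-case variant additionally handles x1 == x2 (doubling) and points at infinity, which the toy version below skips. Everything here (the modulus 101, the helper names, the slope-only output) is a simplified stand-in for arithmetic in Curve::BaseField.

#include <cstdint>
#include <vector>

namespace sketch {
constexpr uint64_t P = 101; // toy prime modulus (placeholder)

inline uint64_t sub_mod(uint64_t a, uint64_t b) { return (a + P - b) % P; }
inline uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % P; }
inline uint64_t inv_mod(uint64_t a) // single inversion via Fermat's little theorem
{
    uint64_t r = 1;
    for (uint64_t e = P - 2; e != 0; e >>= 1) {
        if (e & 1) { r = mul_mod(r, a); }
        a = mul_mod(a, a);
    }
    return r;
}

struct Pt { uint64_t x, y; };

// Compute the addition slope (y2 - y1) / (x2 - x1) for each pair (2i, 2i + 1) of points,
// sharing one inversion across the whole batch, as add_affine_points does per chain.
inline std::vector<uint64_t> batch_slopes(const std::vector<Pt>& pts)
{
    const size_t pairs = pts.size() / 2;
    std::vector<uint64_t> prefix(pairs), out(pairs);
    uint64_t acc = 1;
    for (size_t i = 0; i < pairs; ++i) { // forward pass: record prefix products of denominators
        prefix[i] = acc;
        acc = mul_mod(acc, sub_mod(pts[2 * i + 1].x, pts[2 * i].x));
    }
    uint64_t inv = inv_mod(acc);        // the only inversion
    for (size_t i = pairs; i-- > 0;) {  // backward pass: peel off each denominator's inverse
        const uint64_t d_inv = mul_mod(inv, prefix[i]);
        inv = mul_mod(inv, sub_mod(pts[2 * i + 1].x, pts[2 * i].x));
        out[i] = mul_mod(d_inv, sub_mod(pts[2 * i + 1].y, pts[2 * i].y));
    }
    return out;
}
} // namespace sketch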
// also swap around the `point_pairs` pointer; what used to be our temporary array // has now become our input point array - g1::affine_element* temp = state.point_pairs_1; + typename Curve::AffineElement* temp = state.point_pairs_1; state.num_points = new_num_points; state.points = state.point_pairs_1; state.point_pairs_1 = state.point_pairs_2; @@ -528,8 +544,10 @@ g1::affine_element* reduce_buckets(affine_product_runtime_state& state, bool fir return reduce_buckets(state, false, handle_edge_cases); } -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) +template +uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts) { + using Group = typename Curve::Group; // if this is the first call to `construct_addition_chains`, we need to count up our buckets if (empty_bucket_counts) { memset((void*)state.bucket_counts, 0x00, sizeof(uint32_t) * state.num_buckets); @@ -615,30 +633,30 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_g = state.point_schedule[schedule_it + 6]; const uint64_t schedule_h = state.point_schedule[schedule_it + 7]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_e >> 32ULL), - state.point_pairs_1 + current_offset + 4, - (schedule_e >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_f >> 32ULL), - state.point_pairs_1 + current_offset + 5, - (schedule_f >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_g >> 32ULL), - state.point_pairs_1 + current_offset + 6, - (schedule_g >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_h >> 32ULL), - state.point_pairs_1 + current_offset + 7, - (schedule_h >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_c >> 32ULL), + state.point_pairs_1 + current_offset + 2, + (schedule_c >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_d >> 32ULL), + state.point_pairs_1 + current_offset + 3, + (schedule_d >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_e >> 32ULL), + state.point_pairs_1 + current_offset + 4, + (schedule_e >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_f >> 32ULL), + state.point_pairs_1 + current_offset + 5, + (schedule_f >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_g >> 32ULL), + state.point_pairs_1 + current_offset + 6, + (schedule_g >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_h >> 32ULL), + state.point_pairs_1 + current_offset + 7, + (schedule_h >> 31ULL) & 
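construct_addition_chains copies each scheduled point into the pair array via Group::conditional_negate_affine, selecting P or -P from the sign bit packed into the schedule entry (point index in the high 32 bits, WNAF sign at bit 31, as the shifts above show). A simplified sketch of that helper's contract follows; the real implementation works on Curve::AffineElement and is optimized, unlike this toy version.

#include <cstdint>

namespace sketch {
struct Fq { int64_t v; };
struct Affine { Fq x, y; };

// Copy *src to *dst, negating y when the predicate bit is set. Callers pass
// points + (schedule >> 32) as the source and (schedule >> 31) & 1 as the predicate.
inline void conditional_negate_affine(const Affine* src, Affine* dst, uint64_t predicate)
{
    dst->x = src->x;
    dst->y = predicate ? Fq{ -src->y.v } : src->y; // toy negation; the real code negates in Fq
}
} // namespace sketch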
1ULL); current_offset += 8; schedule_it += 8; @@ -654,18 +672,18 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_c = state.point_schedule[schedule_it + 2]; const uint64_t schedule_d = state.point_schedule[schedule_it + 3]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_c >> 32ULL), - state.point_pairs_1 + current_offset + 2, - (schedule_c >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_d >> 32ULL), - state.point_pairs_1 + current_offset + 3, - (schedule_d >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_c >> 32ULL), + state.point_pairs_1 + current_offset + 2, + (schedule_c >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_d >> 32ULL), + state.point_pairs_1 + current_offset + 3, + (schedule_d >> 31ULL) & 1ULL); current_offset += 4; schedule_it += 4; break; @@ -678,12 +696,12 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t schedule_a = state.point_schedule[schedule_it]; const uint64_t schedule_b = state.point_schedule[schedule_it + 1]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); - g1::conditional_negate_affine(state.points + (schedule_b >> 32ULL), - state.point_pairs_1 + current_offset + 1, - (schedule_b >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_b >> 32ULL), + state.point_pairs_1 + current_offset + 1, + (schedule_b >> 31ULL) & 1ULL); current_offset += 2; schedule_it += 2; break; @@ -695,9 +713,9 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp __builtin_prefetch(state.points + (state.point_schedule[schedule_it + 7] >> 32ULL)); const uint64_t schedule_a = state.point_schedule[schedule_it]; - g1::conditional_negate_affine(state.points + (schedule_a >> 32ULL), - state.point_pairs_1 + current_offset, - (schedule_a >> 31ULL) & 1ULL); + Group::conditional_negate_affine(state.points + (schedule_a >> 32ULL), + state.point_pairs_1 + current_offset, + (schedule_a >> 31ULL) & 1ULL); ++current_offset; ++schedule_it; break; @@ -712,7 +730,7 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp const uint64_t predicate = (schedule >> 31UL) & 1UL; - g1::conditional_negate_affine( + Group::conditional_negate_affine( state.points + (schedule >> 32ULL), state.point_pairs_1 + current_offset, predicate); ++current_offset; ++schedule_it; @@ -724,11 +742,14 @@ uint32_t construct_addition_chains(affine_product_runtime_state& state, bool emp return max_bucket_bits; } -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - 
bool handle_edge_cases) +template +typename Curve::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + typename Curve::AffineElement* points, + const size_t num_points, + bool handle_edge_cases) { + using Element = typename Curve::Element; + using AffineElement = typename Curve::AffineElement; const size_t num_rounds = get_num_rounds(num_points); #ifndef NO_MULTITHREADING const size_t num_threads = max_threads::compute_num_threads(); @@ -737,8 +758,8 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, #endif const size_t bits_per_bucket = get_optimal_bucket_width(num_points / 2); - std::unique_ptr thread_accumulators( - static_cast(aligned_alloc(64, num_threads * sizeof(g1::element))), &aligned_free); + std::unique_ptr thread_accumulators( + static_cast(aligned_alloc(64, num_threads * sizeof(Element))), &aligned_free); #ifndef NO_MULTITHREADING #pragma omp parallel for @@ -750,7 +771,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, const uint64_t num_round_points = state.round_counts[i]; - g1::element accumulator; + Element accumulator; accumulator.self_set_infinity(); if ((num_round_points == 0) || (num_round_points < num_threads && j != num_threads - 1)) { @@ -767,13 +788,14 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, thread_point_schedule[(num_round_points_per_thread - 1 + leftovers)] & 0x7fffffffU; const size_t num_thread_buckets = (last_bucket - first_bucket) + 1; - affine_product_runtime_state product_state = state.get_affine_product_runtime_state(num_threads, j); + affine_product_runtime_state product_state = + state.get_affine_product_runtime_state(num_threads, j); product_state.num_points = static_cast(num_round_points_per_thread + leftovers); product_state.points = points; product_state.point_schedule = thread_point_schedule; product_state.num_buckets = static_cast(num_thread_buckets); - g1::affine_element* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); - g1::element running_sum; + AffineElement* output_buckets = reduce_buckets(product_state, true, handle_edge_cases); + Element running_sum; running_sum.self_set_infinity(); // one nice side-effect of the affine trick, is that half of the bucket concatenation @@ -796,7 +818,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, if (first_bucket > 0) { uint32_t multiplier = static_cast(first_bucket << 1UL); size_t shift = numeric::get_msb(multiplier); - g1::element rolling_accumulator = g1::point_at_infinity; + Element rolling_accumulator = g1::point_at_infinity; bool init = false; while (shift != static_cast(-1)) { if (init) { @@ -817,8 +839,8 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, if (i == (num_rounds - 1)) { const size_t num_points_per_thread = num_points / num_threads; bool* skew_table = &state.skew_table[j * num_points_per_thread]; - g1::affine_element* point_table = &points[j * num_points_per_thread]; - g1::affine_element addition_temporary; + AffineElement* point_table = &points[j * num_points_per_thread]; + AffineElement addition_temporary; for (size_t k = 0; k < num_points_per_thread; ++k) { if (skew_table[k]) { addition_temporary = -point_table[k]; @@ -836,7 +858,7 @@ g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, } } - g1::element result; + Element result; result.self_set_infinity(); for (size_t i = 0; i < num_threads; ++i) { result += thread_accumulators[i]; @@ -844,25 +866,31 @@ g1::element 
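Once reduce_buckets() has collapsed each bucket to a single point, evaluate_pippenger_rounds() finishes the round with the standard running-sum reduction: walking the buckets from the top index down, a running sum absorbs each bucket once while the round accumulator absorbs the running sum, which ends up weighting bucket b by exactly b without any scalar multiplications. A toy version with integers standing in for group elements (the real loop also folds in the first_bucket offset and the final skew pass):

#include <cstddef>
#include <cstdint>
#include <vector>

namespace sketch {
// Given bucket sums B[1..m] (index 0 unused), return the sum over b of b * B[b] using only
// additions, the same pattern the per-round reduction applies to Curve::Element values.
inline int64_t weighted_bucket_sum(const std::vector<int64_t>& buckets)
{
    int64_t running = 0; // plays the role of running_sum (initially the point at infinity)
    int64_t total = 0;   // plays the role of the round accumulator
    for (size_t b = buckets.size(); b-- > 1;) {
        running += buckets[b]; // running now holds B[b] + B[b+1] + ... + B[m]
        total += running;      // so B[b] is added b times in total by the end
    }
    return total;
}
// e.g. weighted_bucket_sum({0, 5, 7, 2}) == 1*5 + 2*7 + 3*2 == 25
} // namespace sketch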
evaluate_pippenger_rounds(pippenger_runtime_state& state, return result; } -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) +template +typename Curve::Element pippenger_internal(typename Curve::AffineElement* points, + typename Curve::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases) { // multiplication_runtime_state state; - compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); + compute_wnaf_states(state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); organize_buckets(state.point_schedule, state.round_counts, num_initial_points * 2); - g1::element result = evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); + typename Curve::Element result = + evaluate_pippenger_rounds(state, points, num_initial_points * 2, handle_edge_cases); return result; } -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases) +template +typename Curve::Element pippenger(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases) { + using Group = typename Curve::Group; + using Element = typename Curve::Element; + // our windowed non-adjacent form algorthm requires that each thread can work on at least 8 points. // If we fall below this theshold, fall back to the traditional scalar multiplication algorithm. // For 8 threads, this neatly coincides with the threshold where Strauss scalar multiplication outperforms Pippenger @@ -873,20 +901,20 @@ g1::element pippenger(fr* scalars, #endif if (num_initial_points == 0) { - g1::element out = g1::one; + Element out = Group::one; out.self_set_infinity(); return out; } if (num_initial_points <= threshold) { - std::vector exponentiation_results(num_initial_points); + std::vector exponentiation_results(num_initial_points); // might as well multithread this... // Possible optimization: use group::batch_mul_with_endomorphism here. #ifndef NO_MULTITHREADING #pragma omp parallel for #endif for (size_t i = 0; i < num_initial_points; ++i) { - exponentiation_results[i] = g1::element(points[i * 2]) * scalars[i]; + exponentiation_results[i] = Element(points[i * 2]) * scalars[i]; } for (size_t i = num_initial_points - 1; i > 0; --i) { @@ -898,7 +926,7 @@ g1::element pippenger(fr* scalars, const size_t slice_bits = static_cast(numeric::get_msb(static_cast(num_initial_points))); const size_t num_slice_points = static_cast(1ULL << slice_bits); - g1::element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); + Element result = pippenger_internal(points, scalars, num_slice_points, state, handle_edge_cases); if (num_slice_points != num_initial_points) { const uint64_t leftover_points = num_initial_points - num_slice_points; @@ -927,21 +955,72 @@ g1::element pippenger(fr* scalars, * Unless you're a malicious adversary, then it would be a great idea! 
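The driver above copes with arbitrary sizes in two ways that are easy to miss in the diff: below a small per-thread threshold it falls back to plain scalar multiplications, and otherwise it peels off the largest power-of-two slice, runs the core routine on it, and recurses on the leftover points (the tail below the threshold is handled by the naive path). The helper below only previews the slice sizes that recursion would produce; it is illustrative and not part of the library.

#include <cstddef>
#include <vector>

namespace sketch {
// Decompose n into the descending power-of-two slice sizes the recursive pippenger()
// calls would process; e.g. 1200000 -> { 1048576, 131072, 16384, ... } (its binary digits).
inline std::vector<size_t> pippenger_slices(size_t n)
{
    std::vector<size_t> slices;
    while (n != 0) {
        size_t msb = 0;
        for (size_t t = n; t >>= 1;) { ++msb; } // index of the most significant set bit
        const size_t slice = size_t(1) << msb;
        slices.push_back(slice);
        n -= slice;
    }
    return slices;
}
} // namespace sketch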
* **/ -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) +template +typename Curve::Element pippenger_unsafe(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state) { return pippenger(scalars, points, num_initial_points, state, false); } -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state) + +template +typename Curve::Element pippenger_without_endomorphism_basis_points(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state) { - std::vector G_mod(num_initial_points * 2); - grumpkin::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); + std::vector G_mod(num_initial_points * 2); + barretenberg::scalar_multiplication::generate_pippenger_point_table(points, &G_mod[0], num_initial_points); return pippenger(scalars, &G_mod[0], num_initial_points, state, false); } + +// Explicit instantiation +template uint32_t construct_addition_chains(affine_product_runtime_state& state, + bool empty_bucket_counts = true); + +template void add_affine_points(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +template void add_affine_points_with_edge_cases(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +template void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases); +template curve::BN254::Element pippenger_internal(curve::BN254::AffineElement* points, + curve::BN254::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +template curve::BN254::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + curve::BN254::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +template curve::BN254::AffineElement* reduce_buckets(affine_product_runtime_state& state, + bool first_round = true, + bool handle_edge_cases = false); + +template curve::BN254::Element pippenger(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +template curve::BN254::Element pippenger_unsafe(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +template curve::BN254::Element pippenger_without_endomorphism_basis_points( + curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp index a9a5c9c89d..eed65f8646 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp @@ -1,11 +1,12 @@ #pragma once -#include "../grumpkin.hpp" +#include 
"../bn254/bn254.hpp" +#include "../grumpkin/grumpkin.hpp" #include "./runtime_states.hpp" #include #include -namespace grumpkin { +namespace barretenberg { namespace scalar_multiplication { constexpr size_t get_num_buckets(const size_t num_points) @@ -79,18 +80,22 @@ constexpr size_t get_num_buckets(const size_t num_points) * **/ -struct multiplication_thread_state { - g1::element* buckets; +template struct multiplication_thread_state { + typename Curve::Element* buckets; const uint64_t* point_schedule; }; +template void compute_wnaf_states(uint64_t* point_schedule, bool* input_skew_table, uint64_t* round_counts, - const fr* scalars, + const typename Curve::ScalarField* scalars, const size_t num_initial_points); -void generate_pippenger_point_table(g1::affine_element* points, g1::affine_element* table, size_t num_points); +template +void generate_pippenger_point_table(typename Curve::AffineElement* points, + typename Curve::AffineElement* table, + size_t num_points); void organize_buckets(uint64_t* point_schedule, const uint64_t* round_counts, const size_t num_points); @@ -111,44 +116,106 @@ inline void count_bits(uint32_t* bucket_counts, } } -uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); +template +uint32_t construct_addition_chains(affine_product_runtime_state& state, bool empty_bucket_counts = true); -void add_affine_points(g1::affine_element* points, const size_t num_points, fq* scratch_space); -void add_affine_points_with_edge_cases(g1::affine_element* points, const size_t num_points, fq* scratch_space); +template +void add_affine_points(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space); -void evaluate_addition_chains(affine_product_runtime_state& state, +template +void add_affine_points_with_edge_cases(typename Curve::AffineElement* points, + const size_t num_points, + typename Curve::BaseField* scratch_space); + +template +void evaluate_addition_chains(affine_product_runtime_state& state, const size_t max_bucket_bits, bool handle_edge_cases); - -g1::element pippenger_internal(g1::affine_element* points, - fr* scalars, - const size_t num_initial_points, - pippenger_runtime_state& state, - bool handle_edge_cases); - -g1::element evaluate_pippenger_rounds(pippenger_runtime_state& state, - g1::affine_element* points, - const size_t num_points, - bool handle_edge_cases = false); - -g1::affine_element* reduce_buckets(affine_product_runtime_state& state, - bool first_round = true, - bool handle_edge_cases = false); - -g1::element pippenger(fr* scalars, - g1::affine_element* points, - const size_t num_points, - pippenger_runtime_state& state, - bool handle_edge_cases = true); - -g1::element pippenger_unsafe(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); -g1::element pippenger_without_endomorphism_basis_points(fr* scalars, - g1::affine_element* points, - const size_t num_initial_points, - pippenger_runtime_state& state); +template +typename Curve::Element pippenger_internal(typename Curve::AffineElement* points, + typename Curve::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +template +typename Curve::Element evaluate_pippenger_rounds(pippenger_runtime_state& state, + typename Curve::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +template +typename Curve::AffineElement* 
reduce_buckets(affine_product_runtime_state& state, + bool first_round = true, + bool handle_edge_cases = false); + +template +typename Curve::Element pippenger(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +template +typename Curve::Element pippenger_unsafe(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +template +typename Curve::Element pippenger_without_endomorphism_basis_points(typename Curve::ScalarField* scalars, + typename Curve::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +// Explicit instantiation +extern template uint32_t construct_addition_chains(affine_product_runtime_state& state, + bool empty_bucket_counts = true); + +extern template void add_affine_points(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +extern template void add_affine_points_with_edge_cases(curve::BN254::AffineElement* points, + const size_t num_points, + curve::BN254::BaseField* scratch_space); + +extern template void evaluate_addition_chains(affine_product_runtime_state& state, + const size_t max_bucket_bits, + bool handle_edge_cases); +extern template curve::BN254::Element pippenger_internal(curve::BN254::AffineElement* points, + curve::BN254::ScalarField* scalars, + const size_t num_initial_points, + pippenger_runtime_state& state, + bool handle_edge_cases); + +extern template curve::BN254::Element evaluate_pippenger_rounds( + pippenger_runtime_state& state, + curve::BN254::AffineElement* points, + const size_t num_points, + bool handle_edge_cases = false); + +extern template curve::BN254::AffineElement* reduce_buckets( + affine_product_runtime_state& state, bool first_round = true, bool handle_edge_cases = false); + +extern template curve::BN254::Element pippenger(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_points, + pippenger_runtime_state& state, + bool handle_edge_cases = true); + +extern template curve::BN254::Element pippenger_unsafe(curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); + +extern template curve::BN254::Element pippenger_without_endomorphism_basis_points( + curve::BN254::ScalarField* scalars, + curve::BN254::AffineElement* points, + const size_t num_initial_points, + pippenger_runtime_state& state); } // namespace scalar_multiplication -} // namespace grumpkin +} // namespace barretenberg diff --git a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp index fa2e5f15c6..e2b09d3d7e 100644 --- a/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp +++ b/cpp/src/barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.test.cpp @@ -1,46 +1,36 @@ -#include -#include -#include - #include "pippenger.hpp" #include "scalar_multiplication.hpp" -#include "barretenberg/common/mem.hpp" +#include #include "barretenberg/common/test.hpp" #include "barretenberg/srs/io.hpp" +#include + #include "barretenberg/numeric/random/engine.hpp" -// paths are relative to cpp/build/ -std::string GRUMPKIN_SRS_PATH = "../srs_db/grumpkin"; +#include 
"barretenberg/common/mem.hpp" + +#define BARRETENBERG_SRS_PATH "../srs_db/ignition" -using namespace grumpkin; -using namespace grumpkin::scalar_multiplication; +using namespace barretenberg; +using namespace barretenberg::scalar_multiplication; namespace { auto& engine = numeric::random::get_debug_engine(); } -TEST(grumpkin_scalar_multiplication, fake_transcript_io) -{ - size_t file_num = 0; - std::string transcript_path = io::get_transcript_path(GRUMPKIN_SRS_PATH, file_num); - - std::vector srs(3); - grumpkin::io::read_transcript_g1(&srs[0], /*degree=*/3, GRUMPKIN_SRS_PATH); - // the SRS is [x^i]_1 where x = 2 - EXPECT_EQ(static_cast(g1::one), srs[0]); - EXPECT_EQ(static_cast(g1::one + g1::one), srs[1]); - EXPECT_EQ(static_cast(g1::one + g1::one + g1::one + g1::one), srs[2]); -} +using Curve = curve::BN254; -TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) +TEST(scalar_multiplication, reduce_buckets_simple) { constexpr size_t num_points = 128; - auto pippenger = Pippenger(GRUMPKIN_SRS_PATH, num_points / 2); + g2::affine_element g2_x; + io::read_transcript_g2(g2_x, BARRETENBERG_SRS_PATH); + auto pippenger = Pippenger(BARRETENBERG_SRS_PATH, num_points / 2); auto monomials = pippenger.get_point_table(); std::vector point_schedule(scalar_multiplication::point_table_size(num_points / 2)); std::array bucket_empty_status; - + // 16 buckets, each bucket has one point std::array transcript; std::array transcript_points; transcript_points[0] = 0x0; @@ -197,14 +187,14 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) std::array bucket_counts; std::array bit_offsets = { 0 }; - scalar_multiplication::affine_product_runtime_state product_state{ + scalar_multiplication::affine_product_runtime_state product_state{ &monomials[0], &point_pairs[0], &output_buckets[0], &scratch_space[0], &bucket_counts[0], &bit_offsets[0], &point_schedule[0], num_points, 2, &bucket_empty_status[0] }; - g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); + g1::affine_element* output = scalar_multiplication::reduce_buckets(product_state, true); for (size_t i = 0; i < product_state.num_buckets; ++i) { expected[i] = expected[i].normalize(); @@ -213,7 +203,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets_simple) } } -TEST(grumpkin_scalar_multiplication, reduce_buckets) +TEST(scalar_multiplication, reduce_buckets) { constexpr size_t num_initial_points = 1 << 12; constexpr size_t num_points = num_initial_points * 2; @@ -235,10 +225,10 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); - // WORKTODO: unify by using 0 g2 elts - grumpkin::io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -246,10 +236,10 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) scalars[i] = fr::random_element(); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + 
scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -276,19 +266,19 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) const size_t last_bucket = point_schedule_copy[num_points - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - &state.point_schedule[num_points], - num_points, - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + point_pairs, + scratch_points, + scratch_field, + bucket_counts, + &bit_offsets[0], + &state.point_schedule[num_points], + num_points, + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + // scalar_multiplication::scalar_multiplication_internal(state, monomials); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; @@ -309,7 +299,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) size_t it = 0; - g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); + g1::affine_element* result_buckets = scalar_multiplication::reduce_buckets(product_state, true); printf("num buckets = %zu \n", num_buckets); for (size_t i = 0; i < num_buckets; ++i) { @@ -334,7 +324,7 @@ TEST(grumpkin_scalar_multiplication, reduce_buckets) } // This test intermittenly fails. 
-TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) +TEST(scalar_multiplication, DISABLED_reduce_buckets_basic) { constexpr size_t num_initial_points = 1 << 20; constexpr size_t num_points = num_initial_points * 2; @@ -352,7 +342,8 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) memset((void*)scratch_field, 0x00, num_points * sizeof(fq)); memset((void*)bucket_empty_status, 0x00, num_points * sizeof(bool)); - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -362,11 +353,11 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -386,20 +377,20 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) const size_t last_bucket = state.point_schedule[num_points - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - point_pairs, - scratch_points, - scratch_field, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - (uint32_t)state.round_counts[0], - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + point_pairs, + scratch_points, + scratch_field, + bucket_counts, + &bit_offsets[0], + state.point_schedule, + (uint32_t)state.round_counts[0], + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - scalar_multiplication::reduce_buckets(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + scalar_multiplication::reduce_buckets(product_state, true); + // scalar_multiplication::scalar_multiplication_internal(state, monomials); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; @@ -413,7 +404,7 @@ TEST(grumpkin_scalar_multiplication, DISABLED_reduce_buckets_basic) aligned_free(bucket_counts); } -TEST(grumpkin_scalar_multiplication, add_affine_points) +TEST(scalar_multiplication, add_affine_points) { constexpr size_t num_points = 20; g1::affine_element* points = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); @@ -434,7 +425,7 @@ TEST(grumpkin_scalar_multiplication, add_affine_points) points_copy[count + 1] = points_copy[count + 1].normalize(); } - scalar_multiplication::add_affine_points(points, num_points, scratch_space); + scalar_multiplication::add_affine_points(points, num_points, scratch_space); for (size_t i = 
num_points - 1; i > num_points - 1 - (num_points / 2); --i) { EXPECT_EQ((points[i].x == points_copy[i].x), true); EXPECT_EQ((points[i].y == points_copy[i].y), true); @@ -445,13 +436,14 @@ TEST(grumpkin_scalar_multiplication, add_affine_points) aligned_free(scratch_space); } -TEST(grumpkin_scalar_multiplication, construct_addition_chains) +TEST(scalar_multiplication, construct_addition_chains) { constexpr size_t num_initial_points = 1 << 20; constexpr size_t num_points = num_initial_points * 2; g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (num_points))); - io::read_transcript(monomials, num_initial_points, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, num_initial_points, BARRETENBERG_SRS_PATH); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * num_initial_points)); @@ -461,11 +453,11 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(num_initial_points); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); + scalar_multiplication::pippenger_runtime_state state(num_initial_points); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, num_initial_points); std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, num_initial_points); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::milliseconds diff = std::chrono::duration_cast(end - start); @@ -485,22 +477,23 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) const size_t last_bucket = state.point_schedule[state.round_counts[0] - 1] & 0x7fffffffULL; const size_t num_buckets = last_bucket - first_bucket + 1; - scalar_multiplication::affine_product_runtime_state product_state{ monomials, - monomials, - monomials, - nullptr, - bucket_counts, - &bit_offsets[0], - state.point_schedule, - num_points, - static_cast(num_buckets), - bucket_empty_status }; + scalar_multiplication::affine_product_runtime_state product_state{ monomials, + monomials, + monomials, + nullptr, + bucket_counts, + &bit_offsets[0], + state.point_schedule, + static_cast( + state.round_counts[0]), + static_cast(num_buckets), + bucket_empty_status }; start = std::chrono::steady_clock::now(); - scalar_multiplication::construct_addition_chains(product_state, true); - // scalar_multiplication::scalar_multiplication_internal(state, monomials); + scalar_multiplication::construct_addition_chains(product_state, true); end = std::chrono::steady_clock::now(); diff = std::chrono::duration_cast(end - start); + info("construct addition chains: ", diff.count(), "ms"); std::cout << "scalar mul: " << diff.count() << "ms" << std::endl; aligned_free(bucket_empty_status); @@ -509,7 +502,7 @@ TEST(grumpkin_scalar_multiplication, construct_addition_chains) aligned_free(bucket_counts); } -TEST(grumpkin_scalar_multiplication, endomorphism_split) +TEST(scalar_multiplication, endomorphism_split) { fr scalar = fr::random_element(); @@ -545,7 +538,7 @@ TEST(grumpkin_scalar_multiplication, endomorphism_split) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, radix_sort) +TEST(scalar_multiplication, radix_sort) { // check that our radix sort correctly sorts! 
constexpr size_t target_degree = 1 << 8; @@ -558,8 +551,8 @@ TEST(grumpkin_scalar_multiplication, radix_sort) fr::__copy(source_scalar, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(target_degree); - scalar_multiplication::compute_wnaf_states( + scalar_multiplication::pippenger_runtime_state state(target_degree); + scalar_multiplication::compute_wnaf_states( state.point_schedule, state.skew_table, state.round_counts, scalars, target_degree); uint64_t* wnaf_copy = (uint64_t*)(aligned_alloc(64, sizeof(uint64_t) * target_degree * 2 * num_rounds)); @@ -590,7 +583,7 @@ TEST(grumpkin_scalar_multiplication, radix_sort) free(wnaf_copy); } -HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) +HEAVY_TEST(scalar_multiplication, oversized_inputs) { // for point ranges with more than 1 << 20 points, we split into chunks of smaller multi-exps. // Check that this is done correctly @@ -598,12 +591,13 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) size_t target_degree = 1200000; g1::affine_element* monomials = (g1::affine_element*)(aligned_alloc(64, sizeof(g1::affine_element) * (2 * target_degree))); - io::read_transcript(monomials, transcript_degree, GRUMPKIN_SRS_PATH); + g2::affine_element g2_x; + io::read_transcript(monomials, g2_x, transcript_degree, BARRETENBERG_SRS_PATH); memcpy((void*)(monomials + (2 * transcript_degree)), (void*)monomials, ((2 * target_degree - 2 * transcript_degree) * sizeof(g1::affine_element))); - scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); + scalar_multiplication::generate_pippenger_point_table(monomials, monomials, target_degree); fr* scalars = (fr*)(aligned_alloc(64, sizeof(fr) * target_degree)); @@ -613,17 +607,17 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) accumulator *= source_scalar; fr::__copy(accumulator, scalars[i]); } - scalar_multiplication::pippenger_runtime_state state(target_degree); + scalar_multiplication::pippenger_runtime_state state(target_degree); - g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); + g1::element first = scalar_multiplication::pippenger(scalars, monomials, target_degree, state); first = first.normalize(); for (size_t i = 0; i < target_degree; ++i) { scalars[i].self_neg(); } - scalar_multiplication::pippenger_runtime_state state_2(target_degree); + scalar_multiplication::pippenger_runtime_state state_2(target_degree); - g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); + g1::element second = scalar_multiplication::pippenger(scalars, monomials, target_degree, state_2); second = second.normalize(); EXPECT_EQ((first.z == second.z), true); @@ -635,7 +629,7 @@ HEAVY_TEST(grumpkin_scalar_multiplication, oversized_inputs) aligned_free(scalars); } -TEST(grumpkin_scalar_multiplication, undersized_inputs) +TEST(scalar_multiplication, undersized_inputs) { // we fall back to traditional scalar multiplication algorithm for small input sizes. 
// Check this is done correctly @@ -658,11 +652,11 @@ TEST(grumpkin_scalar_multiplication, undersized_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -671,7 +665,7 @@ TEST(grumpkin_scalar_multiplication, undersized_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger) +TEST(scalar_multiplication, pippenger) { constexpr size_t num_points = 8192; @@ -692,10 +686,10 @@ TEST(grumpkin_scalar_multiplication, pippenger) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -704,7 +698,7 @@ TEST(grumpkin_scalar_multiplication, pippenger) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) +TEST(scalar_multiplication, pippenger_edge_case_dbl) { constexpr size_t num_points = 128; @@ -728,9 +722,9 @@ TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) if (!expected.is_point_at_infinity()) { expected = expected.normalize(); } - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -739,7 +733,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_edge_case_dbl) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) +TEST(scalar_multiplication, pippenger_short_inputs) { constexpr size_t num_points = 8192; @@ -780,10 +774,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -792,7 +786,7 @@ 
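For orientation, this is the call sequence the tests above repeat, written once against the templatized API this patch introduces (assumes the includes at the top of the test file; allocation, SRS loading and normalisation are elided, and msm_sketch is a hypothetical wrapper name).

#include "pippenger.hpp"
#include "scalar_multiplication.hpp"

using namespace barretenberg;
using Curve = curve::BN254;

Curve::Element msm_sketch(Curve::ScalarField* scalars, Curve::AffineElement* points, size_t num_points)
{
    // `points` must hold 2 * num_points entries: the table interleaves each P_i with its endo image.
    scalar_multiplication::generate_pippenger_point_table<Curve>(points, points, num_points);
    scalar_multiplication::pippenger_runtime_state<Curve> state(num_points);
    return scalar_multiplication::pippenger<Curve>(scalars, points, num_points, state);
}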
TEST(grumpkin_scalar_multiplication, pippenger_short_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_unsafe) +TEST(scalar_multiplication, pippenger_unsafe) { constexpr size_t num_points = 8192; @@ -812,10 +806,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); + scalar_multiplication::pippenger_runtime_state state(num_points); + g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -824,7 +818,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) +TEST(scalar_multiplication, pippenger_unsafe_short_inputs) { constexpr size_t num_points = 8192; @@ -866,10 +860,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger_unsafe(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -878,7 +872,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_unsafe_short_inputs) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_one) +TEST(scalar_multiplication, pippenger_one) { size_t num_points = 1; @@ -899,10 +893,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_one) expected += temp; } expected = expected.normalize(); - scalar_multiplication::generate_pippenger_point_table(points, points, num_points); - scalar_multiplication::pippenger_runtime_state state(num_points); + scalar_multiplication::generate_pippenger_point_table(points, points, num_points); + scalar_multiplication::pippenger_runtime_state state(num_points); - g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); + g1::element result = scalar_multiplication::pippenger(scalars, points, num_points, state); result = result.normalize(); aligned_free(scalars); @@ -911,14 +905,14 @@ TEST(grumpkin_scalar_multiplication, pippenger_one) EXPECT_EQ(result == expected, true); } -TEST(grumpkin_scalar_multiplication, pippenger_zero_points) +TEST(scalar_multiplication, pippenger_zero_points) { fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); g1::affine_element* points = (g1::affine_element*)aligned_alloc(32, sizeof(g1::affine_element) * 2 + 1); - scalar_multiplication::pippenger_runtime_state state(0); - g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); + scalar_multiplication::pippenger_runtime_state state(0); + g1::element result = scalar_multiplication::pippenger(scalars, points, 0, state); aligned_free(scalars); aligned_free(points); @@ -926,7 
+920,7 @@ TEST(grumpkin_scalar_multiplication, pippenger_zero_points) EXPECT_EQ(result.is_point_at_infinity(), true); } -TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) +TEST(scalar_multiplication, pippenger_mul_by_zero) { fr* scalars = (fr*)aligned_alloc(32, sizeof(fr)); @@ -934,10 +928,10 @@ TEST(grumpkin_scalar_multiplication, pippenger_mul_by_zero) scalars[0] = fr::zero(); points[0] = g1::affine_one; - scalar_multiplication::generate_pippenger_point_table(points, points, 1); + scalar_multiplication::generate_pippenger_point_table(points, points, 1); - scalar_multiplication::pippenger_runtime_state state(1); - g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); + scalar_multiplication::pippenger_runtime_state state(1); + g1::element result = scalar_multiplication::pippenger(scalars, points, 1, state); aligned_free(scalars); aligned_free(points); diff --git a/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp b/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp index 5436fcb8f6..070222911c 100644 --- a/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp +++ b/cpp/src/barretenberg/ecc/curves/secp256k1/secp256k1.hpp @@ -153,7 +153,8 @@ class SECP256K1 { public: using ScalarField = secp256k1::fr; using BaseField = secp256k1::fq; - using ProjectiveElement = typename secp256k1::g1::element; - using AffineElement = typename secp256k1::g1::affine_element; + using Group = secp256k1::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp b/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp index 4a9e0b7c90..2d04e47c90 100644 --- a/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp +++ b/cpp/src/barretenberg/ecc/curves/secp256r1/secp256r1.hpp @@ -140,7 +140,8 @@ class SECP256R1 { public: using ScalarField = secp256r1::fr; using BaseField = secp256r1::fq; - using ProjectiveElement = typename secp256r1::g1::element; - using AffineElement = typename secp256r1::g1::affine_element; + using Group = secp256r1::g1; + using Element = typename Group::element; + using AffineElement = typename Group::affine_element; }; } // namespace curve \ No newline at end of file diff --git a/cpp/src/barretenberg/honk/pcs/commitment_key.hpp b/cpp/src/barretenberg/honk/pcs/commitment_key.hpp index 5d4e32c80d..2f95c33f2c 100644 --- a/cpp/src/barretenberg/honk/pcs/commitment_key.hpp +++ b/cpp/src/barretenberg/honk/pcs/commitment_key.hpp @@ -8,7 +8,7 @@ #include "barretenberg/polynomials/polynomial_arithmetic.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include "barretenberg/srs/reference_string/file_reference_string.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/ecc/curves/bn254/pairing.hpp" #include "barretenberg/numeric/bitop/pow.hpp" @@ -61,12 +61,12 @@ class CommitmentKey { { const size_t degree = polynomial.size(); ASSERT(degree <= srs.get_monomial_size()); - return barretenberg::scalar_multiplication::pippenger_unsafe( + return barretenberg::scalar_multiplication::pippenger_unsafe( const_cast(polynomial.data()), srs.get_monomial_points(), degree, pippenger_runtime_state); }; private: - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + 
barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; @@ -239,11 +239,11 @@ class CommitmentKey { { const size_t degree = polynomial.size(); ASSERT(degree <= srs.get_monomial_size()); - return barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + return barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( const_cast(polynomial.data()), srs.get_monomial_points(), degree, pippenger_runtime_state); }; - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; @@ -268,7 +268,7 @@ class VerificationKey { , srs(num_points, std::string(path)) {} - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; proof_system::FileReferenceString srs; }; diff --git a/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp b/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp index a7eaf54f91..c571cb5d89 100644 --- a/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp +++ b/cpp/src/barretenberg/honk/pcs/ipa/ipa.hpp @@ -2,7 +2,7 @@ #include #include #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/honk/pcs/commitment_key.hpp" #include "barretenberg/stdlib/primitives/curves/bn254.hpp" @@ -88,13 +88,15 @@ template class InnerProductArgument { inner_prod_R += a_vec[round_size + j] * b_vec[j]; } // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator - L_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); + L_elements[i] = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &a_vec[0], &G_vec_local[round_size], round_size, ck->pippenger_runtime_state); L_elements[i] += aux_generator * inner_prod_L; // R_i = < a_vec_hi, G_vec_lo > + inner_prod_R * aux_generator - R_elements[i] = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state); + R_elements[i] = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &a_vec[round_size], &G_vec_local[0], round_size, ck->pippenger_runtime_state); R_elements[i] += aux_generator * inner_prod_R; std::string index = std::to_string(i); @@ -178,8 +180,9 @@ template class InnerProductArgument { msm_scalars[2 * i] = round_challenges[i].sqr(); msm_scalars[2 * i + 1] = round_challenges_inv[i].sqr(); } - Commitment LR_sums = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( - &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state); + Commitment LR_sums = + barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + &msm_scalars[0], &msm_elements[0], pippenger_size, vk->pippenger_runtime_state); Commitment C_zero = C_prime + LR_sums; /** @@ -218,7 +221,7 @@ template class InnerProductArgument { for (size_t i = 0; i < poly_degree; i++) { G_vec_local[i] = srs_elements[i]; } - auto G_zero = barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( + auto G_zero = 
barretenberg::scalar_multiplication::pippenger_without_endomorphism_basis_points( &s_vec[0], &G_vec_local[0], poly_degree, vk->pippenger_runtime_state); auto a_zero = transcript.template receive_from_prover("IPA:a_0"); diff --git a/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp b/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp index 628a969d26..8b9de4e124 100644 --- a/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp +++ b/cpp/src/barretenberg/honk/proof_system/ultra_verifier.cpp @@ -2,10 +2,10 @@ #include "barretenberg/honk/transcript/transcript.hpp" #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/honk/flavor/standard.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/honk/utils/power_polynomial.hpp" -#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-variable" // TODO(Cody): this needs to go. using namespace barretenberg; using namespace proof_system::honk::sumcheck; diff --git a/cpp/src/barretenberg/honk/proof_system/verifier.cpp b/cpp/src/barretenberg/honk/proof_system/verifier.cpp index 3506f15aaa..259c9d13db 100644 --- a/cpp/src/barretenberg/honk/proof_system/verifier.cpp +++ b/cpp/src/barretenberg/honk/proof_system/verifier.cpp @@ -2,7 +2,7 @@ #include "barretenberg/honk/transcript/transcript.hpp" #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/honk/flavor/standard.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/honk/utils/power_polynomial.hpp" using namespace barretenberg; diff --git a/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp b/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp index 5ee4b90fad..ba8e78c037 100644 --- a/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp +++ b/cpp/src/barretenberg/join_split_example/proofs/join_split/c_bind.cpp @@ -3,6 +3,7 @@ #include "c_bind.h" #include "join_split.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" // WORKTODO: needed? #include "compute_signing_data.hpp" #include "../mock/mock_circuit.hpp" #include "barretenberg/common/streams.hpp" @@ -59,7 +60,7 @@ WASM_EXPORT uint32_t join_split__get_new_proving_key_data(uint8_t** output) WASM_EXPORT void join_split__init_verification_key(void* pippenger, uint8_t const* g2x) { auto crs_factory = std::make_unique( - reinterpret_cast(pippenger), g2x); + reinterpret_cast*>(pippenger), g2x); init_verification_key(std::move(crs_factory)); } diff --git a/cpp/src/barretenberg/plonk/composer/composer_base.cpp b/cpp/src/barretenberg/plonk/composer/composer_base.cpp index 46e404a57e..e3698dc434 100644 --- a/cpp/src/barretenberg/plonk/composer/composer_base.cpp +++ b/cpp/src/barretenberg/plonk/composer/composer_base.cpp @@ -330,10 +330,10 @@ std::shared_ptr ComposerBase::compute_verification_key_base( // Commit to the constraint selector polynomial and insert the commitment in the verification key. 
auto selector_poly_commitment = g1::affine_element( - scalar_multiplication::pippenger(selector_poly_coefficients, - proving_key->reference_string->get_monomial_points(), - proving_key->circuit_size, - proving_key->pippenger_runtime_state)); + scalar_multiplication::pippenger(selector_poly_coefficients, + proving_key->reference_string->get_monomial_points(), + proving_key->circuit_size, + proving_key->pippenger_runtime_state)); circuit_verification_key->commitments.insert({ selector_commitment_label, selector_poly_commitment }); } diff --git a/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp b/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp index c367d9ccd1..5b8ee42d10 100644 --- a/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp +++ b/cpp/src/barretenberg/plonk/composer/splitting_tmp/composer_helper/turbo_plonk_composer_helper.cpp @@ -1,6 +1,6 @@ #include "turbo_plonk_composer_helper.hpp" #include "barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/turbo_arithmetic_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/standard_composer.cpp b/cpp/src/barretenberg/plonk/composer/standard_composer.cpp index 1158590acf..dc04716864 100644 --- a/cpp/src/barretenberg/plonk/composer/standard_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/standard_composer.cpp @@ -1,6 +1,6 @@ #include "standard_composer.hpp" #include "barretenberg/plonk/proof_system/types/prover_settings.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/arithmetic_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp b/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp index f3339cf147..20f686e4d3 100644 --- a/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/turbo_composer.cpp @@ -1,5 +1,5 @@ #include "turbo_composer.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? 
#include "barretenberg/numeric/bitop/get_msb.hpp" #include "barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget.hpp" #include "barretenberg/plonk/proof_system/widgets/transition_widgets/turbo_arithmetic_widget.hpp" diff --git a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp index ce0f4eac7b..fab8b9c575 100644 --- a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp +++ b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp @@ -1,6 +1,6 @@ #include "ultra_composer.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" #include #include diff --git a/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp b/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp index b7730585e1..32a14c0ac6 100644 --- a/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/prover/prover.cpp @@ -3,7 +3,7 @@ #include "barretenberg/plonk/proof_system/types/prover_settings.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/iterate_over_domain.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" diff --git a/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp b/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp index f3198a5cbb..f30b0bbf9f 100644 --- a/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp +++ b/cpp/src/barretenberg/plonk/proof_system/proving_key/proving_key.hpp @@ -1,14 +1,15 @@ #pragma once -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/runtime_states.hpp" #include +#include + +#include "barretenberg/ecc/curves/scalar_multiplication/runtime_states.hpp" #include "barretenberg/polynomials/evaluation_domain.hpp" #include "barretenberg/polynomials/polynomial.hpp" - #include "barretenberg/proof_system/polynomial_store/polynomial_store.hpp" #include "barretenberg/srs/reference_string/reference_string.hpp" #include "barretenberg/plonk/proof_system/constants.hpp" #include "barretenberg/plonk/proof_system/types/polynomial_manifest.hpp" -#include +#include "barretenberg/ecc/curves/bn254/bn254.hpp" namespace proof_system::plonk { @@ -62,7 +63,7 @@ struct proving_key { barretenberg::polynomial quotient_polynomial_parts[plonk::NUM_QUOTIENT_PARTS]; - barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; + barretenberg::scalar_multiplication::pippenger_runtime_state pippenger_runtime_state; PolynomialManifest polynomial_manifest; diff --git a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp index bee400fd36..6f37c631ac 100644 --- a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.cpp @@ -5,7 +5,7 @@ #include "../utils/kate_verification.hpp" #include "barretenberg/ecc/curves/bn254/fq12.hpp" #include "barretenberg/ecc/curves/bn254/pairing.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include 
"barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" using namespace barretenberg; @@ -176,12 +176,13 @@ template bool VerifierBase::verify size_t num_elements = elements.size(); elements.resize(num_elements * 2); - barretenberg::scalar_multiplication::generate_pippenger_point_table(&elements[0], &elements[0], num_elements); - scalar_multiplication::pippenger_runtime_state state(num_elements); + barretenberg::scalar_multiplication::generate_pippenger_point_table( + &elements[0], &elements[0], num_elements); + scalar_multiplication::pippenger_runtime_state state(num_elements); g1::element P[2]; - P[0] = barretenberg::scalar_multiplication::pippenger(&scalars[0], &elements[0], num_elements, state); + P[0] = barretenberg::scalar_multiplication::pippenger(&scalars[0], &elements[0], num_elements, state); P[1] = -(g1::element(PI_Z_OMEGA) * separator_challenge + PI_Z); if (key->contains_recursive_proof) { diff --git a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp index 701a465189..eccd0ed6fa 100644 --- a/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp +++ b/cpp/src/barretenberg/plonk/proof_system/verifier/verifier.test.cpp @@ -5,7 +5,7 @@ #include "../../../transcript/transcript.hpp" #include "barretenberg/plonk/composer/standard_composer.hpp" #include "verifier.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include #include "barretenberg/srs/reference_string/file_reference_string.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -29,15 +29,15 @@ plonk::Verifier generate_verifier(std::shared_ptr circuit_proving_k poly_coefficients[7] = circuit_proving_key->polynomial_store.get("sigma_3").get_coefficients(); std::vector commitments; - scalar_multiplication::pippenger_runtime_state state(circuit_proving_key->circuit_size); + scalar_multiplication::pippenger_runtime_state state(circuit_proving_key->circuit_size); commitments.resize(8); for (size_t i = 0; i < 8; ++i) { commitments[i] = g1::affine_element( - scalar_multiplication::pippenger(poly_coefficients[i], - circuit_proving_key->reference_string->get_monomial_points(), - circuit_proving_key->circuit_size, - state)); + scalar_multiplication::pippenger(poly_coefficients[i], + circuit_proving_key->reference_string->get_monomial_points(), + circuit_proving_key->circuit_size, + state)); } auto crs = std::make_shared("../srs_db/ignition"); diff --git a/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp b/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp index f59863bc5e..ec2b12d46b 100644 --- a/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp +++ b/cpp/src/barretenberg/plonk/proof_system/widgets/random_widgets/permutation_widget_impl.hpp @@ -1,6 +1,6 @@ #pragma once #include "barretenberg/common/mem.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/plonk/proof_system/proving_key/proving_key.hpp" #include "barretenberg/plonk/proof_system/public_inputs/public_inputs.hpp" #include "barretenberg/transcript/transcript.hpp" @@ -229,7 +229,7 @@ void 
ProverPermutationWidget(state.range(0)); state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(num_points); + scalar_multiplication::pippenger_runtime_state run_state(num_points); state.ResumeTiming(); // uint64_t before = rdtsc(); - scalar_multiplication::pippenger(&globals.scalars[0], &globals.monomials[0], num_points, run_state); + scalar_multiplication::pippenger( + &globals.scalars[0], &globals.monomials[0], num_points, run_state); // uint64_t after = rdtsc(); // count += (after - before); // ++i; @@ -143,7 +144,7 @@ void unsafe_pippenger_bench(State& state) noexcept uint64_t i = 0; for (auto _ : state) { state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(num_points); + scalar_multiplication::pippenger_runtime_state run_state(num_points); state.ResumeTiming(); uint64_t before = rdtsc(); @@ -164,28 +165,28 @@ void new_plonk_scalar_multiplications_bench(State& state) noexcept uint64_t k = 0; for (auto _ : state) { state.PauseTiming(); - scalar_multiplication::pippenger_runtime_state run_state(MAX_GATES); + scalar_multiplication::pippenger_runtime_state run_state(MAX_GATES); state.ResumeTiming(); uint64_t before = rdtsc(); - g1::element a = - scalar_multiplication::pippenger(&globals.scalars[0], &globals.monomials[0], MAX_GATES, run_state); - g1::element b = - scalar_multiplication::pippenger(&globals.scalars[1], &globals.monomials[0], MAX_GATES, run_state); - g1::element c = - scalar_multiplication::pippenger(&globals.scalars[2], &globals.monomials[0], MAX_GATES, run_state); - g1::element d = - scalar_multiplication::pippenger(&globals.scalars[3], &globals.monomials[0], MAX_GATES, run_state); - g1::element e = - scalar_multiplication::pippenger(&globals.scalars[4], &globals.monomials[0], MAX_GATES, run_state); - g1::element f = - scalar_multiplication::pippenger(&globals.scalars[5], &globals.monomials[0], MAX_GATES, run_state); - g1::element g = - scalar_multiplication::pippenger(&globals.scalars[6], &globals.monomials[0], MAX_GATES, run_state); - g1::element h = - scalar_multiplication::pippenger(&globals.scalars[7], &globals.monomials[0], MAX_GATES, run_state); - g1::element i = - scalar_multiplication::pippenger(&globals.scalars[8], &globals.monomials[0], MAX_GATES, run_state); + g1::element a = scalar_multiplication::pippenger( + &globals.scalars[0], &globals.monomials[0], MAX_GATES, run_state); + g1::element b = scalar_multiplication::pippenger( + &globals.scalars[1], &globals.monomials[0], MAX_GATES, run_state); + g1::element c = scalar_multiplication::pippenger( + &globals.scalars[2], &globals.monomials[0], MAX_GATES, run_state); + g1::element d = scalar_multiplication::pippenger( + &globals.scalars[3], &globals.monomials[0], MAX_GATES, run_state); + g1::element e = scalar_multiplication::pippenger( + &globals.scalars[4], &globals.monomials[0], MAX_GATES, run_state); + g1::element f = scalar_multiplication::pippenger( + &globals.scalars[5], &globals.monomials[0], MAX_GATES, run_state); + g1::element g = scalar_multiplication::pippenger( + &globals.scalars[6], &globals.monomials[0], MAX_GATES, run_state); + g1::element h = scalar_multiplication::pippenger( + &globals.scalars[7], &globals.monomials[0], MAX_GATES, run_state); + g1::element i = scalar_multiplication::pippenger( + &globals.scalars[8], &globals.monomials[0], MAX_GATES, run_state); uint64_t after = rdtsc(); count += (after - before); ++k; diff --git a/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp 
b/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp index 8bc4b461e2..c8a852015e 100644 --- a/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp +++ b/cpp/src/barretenberg/proof_system/circuit_constructors/turbo_circuit_constructor.cpp @@ -1,5 +1,5 @@ #include "turbo_circuit_constructor.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" // WORKTODO: needed? #include "barretenberg/numeric/bitop/get_msb.hpp" using namespace barretenberg; diff --git a/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp b/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp index 038716f4f8..3214799518 100644 --- a/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp +++ b/cpp/src/barretenberg/proof_system/work_queue/work_queue.cpp @@ -1,6 +1,6 @@ #include "work_queue.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/scalar_multiplication.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" namespace proof_system::plonk { @@ -206,8 +206,8 @@ void work_queue::process_queue() barretenberg::g1::affine_element* srs_points = key->reference_string->get_monomial_points(); // Run pippenger multi-scalar multiplication. - auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(msm_size); - barretenberg::g1::affine_element result(barretenberg::scalar_multiplication::pippenger_unsafe( + auto runtime_state = barretenberg::scalar_multiplication::pippenger_runtime_state(msm_size); + barretenberg::g1::affine_element result(barretenberg::scalar_multiplication::pippenger_unsafe( item.mul_scalars, srs_points, msm_size, runtime_state)); transcript->add_element(item.tag, result.to_buffer()); diff --git a/cpp/src/barretenberg/srs/io.cpp b/cpp/src/barretenberg/srs/io.cpp index 6e77540690..3b51adaf67 100644 --- a/cpp/src/barretenberg/srs/io.cpp +++ b/cpp/src/barretenberg/srs/io.cpp @@ -35,9 +35,9 @@ void read_manifest(std::string const& filename, Manifest& manifest) manifest.start_from = ntohl(manifest.start_from); } -void byteswap(g1::affine_element* elements, size_t elements_size) +template void byteswap(typename Curve::AffineElement* elements, size_t elements_size) { - constexpr size_t bytes_per_element = sizeof(g1::affine_element); + constexpr size_t bytes_per_element = sizeof(typename Curve::AffineElement); size_t num_elements = elements_size / bytes_per_element; if (is_little_endian()) { @@ -59,7 +59,7 @@ void byteswap(g1::affine_element* elements, size_t elements_size) void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size) { memcpy((void*)elements, (void*)buffer, buffer_size); - byteswap(elements, buffer_size); + byteswap(elements, buffer_size); } void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size) @@ -138,7 +138,8 @@ bool is_file_exist(std::string const& fileName) return infile.good(); } -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir) +template +void read_transcript_g1(typename Curve::AffineElement* monomials, size_t degree, std::string const& dir) { size_t num = 0; size_t num_read = 0; @@ -158,7 +159,7 @@ void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::strin // We must pass the size actually read to 
the second call, not the desired // g1_buffer_size as the file may have been smaller than this. read_file_into_buffer(buffer, size, path, offset, g1_buffer_size); - byteswap(&monomials[num_read], size); + byteswap(&monomials[num_read], size); num_read += num_to_read; path = get_transcript_path(dir, ++num); @@ -215,7 +216,7 @@ void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir) void read_transcript(g1::affine_element* monomials, g2::affine_element& g2_x, size_t degree, std::string const& path) { - read_transcript_g1(monomials, degree, path); + read_transcript_g1(monomials, degree, path); read_transcript_g2(g2_x, path); } @@ -363,31 +364,31 @@ void read_manifest(std::string const& filename, Manifest& manifest) manifest.start_from = ntohl(manifest.start_from); } -void byteswap(g1::affine_element* elements, size_t elements_size) -{ - constexpr size_t bytes_per_element = sizeof(g1::affine_element); - size_t num_elements = elements_size / bytes_per_element; +// void byteswap(g1::affine_element* elements, size_t elements_size) +// { +// constexpr size_t bytes_per_element = sizeof(g1::affine_element); +// size_t num_elements = elements_size / bytes_per_element; - if (is_little_endian()) { - for (size_t i = 0; i < num_elements; ++i) { - elements[i].x.data[0] = __builtin_bswap64(elements[i].x.data[0]); - elements[i].x.data[1] = __builtin_bswap64(elements[i].x.data[1]); - elements[i].x.data[2] = __builtin_bswap64(elements[i].x.data[2]); - elements[i].x.data[3] = __builtin_bswap64(elements[i].x.data[3]); - elements[i].y.data[0] = __builtin_bswap64(elements[i].y.data[0]); - elements[i].y.data[1] = __builtin_bswap64(elements[i].y.data[1]); - elements[i].y.data[2] = __builtin_bswap64(elements[i].y.data[2]); - elements[i].y.data[3] = __builtin_bswap64(elements[i].y.data[3]); - elements[i].x.self_to_montgomery_form(); - elements[i].y.self_to_montgomery_form(); - } - } -} +// if (is_little_endian()) { +// for (size_t i = 0; i < num_elements; ++i) { +// elements[i].x.data[0] = __builtin_bswap64(elements[i].x.data[0]); +// elements[i].x.data[1] = __builtin_bswap64(elements[i].x.data[1]); +// elements[i].x.data[2] = __builtin_bswap64(elements[i].x.data[2]); +// elements[i].x.data[3] = __builtin_bswap64(elements[i].x.data[3]); +// elements[i].y.data[0] = __builtin_bswap64(elements[i].y.data[0]); +// elements[i].y.data[1] = __builtin_bswap64(elements[i].y.data[1]); +// elements[i].y.data[2] = __builtin_bswap64(elements[i].y.data[2]); +// elements[i].y.data[3] = __builtin_bswap64(elements[i].y.data[3]); +// elements[i].x.self_to_montgomery_form(); +// elements[i].y.self_to_montgomery_form(); +// } +// } +// } void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size) { memcpy((void*)elements, (void*)buffer, buffer_size); - byteswap(elements, buffer_size); + barretenberg::io::byteswap(elements, buffer_size); } // void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size) @@ -486,7 +487,7 @@ void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::strin // We must pass the size actually read to the second call, not the desired // g1_buffer_size as the file may have been smaller than this. 
read_file_into_buffer(buffer, size, path, offset, g1_buffer_size); - byteswap(&monomials[num_read], size); + barretenberg::io::byteswap(&monomials[num_read], size); num_read += num_to_read; path = get_transcript_path(dir, ++num); @@ -671,3 +672,17 @@ void write_transcript(g1::affine_element const* g1_x, } // namespace io } // namespace grumpkin + +// WORKTODO: hack +namespace barretenberg { +namespace io { +template void read_transcript_g1(barretenberg::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +template void byteswap(barretenberg::g1::affine_element* elements, size_t buffer_size); +template void read_transcript_g1(grumpkin::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +template void byteswap(grumpkin::g1::affine_element* elements, size_t buffer_size); +} // namespace io +} // namespace barretenberg diff --git a/cpp/src/barretenberg/srs/io.hpp b/cpp/src/barretenberg/srs/io.hpp index f08a53f9c4..5af499ed09 100644 --- a/cpp/src/barretenberg/srs/io.hpp +++ b/cpp/src/barretenberg/srs/io.hpp @@ -1,6 +1,5 @@ #pragma once -#include "../ecc/curves/bn254/g1.hpp" -#include "../ecc/curves/bn254/g2.hpp" +#include "../ecc/curves/bn254/bn254.hpp" #include "../ecc/curves/grumpkin/grumpkin.hpp" #include #include @@ -18,14 +17,16 @@ struct Manifest { uint32_t start_from; }; -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); +template +void read_transcript_g1(typename Curve::AffineElement* monomials, size_t degree, std::string const& dir); void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir); void read_transcript(g1::affine_element* monomials, g2::affine_element& g2_x, size_t degree, std::string const& path); void read_g1_elements_from_buffer(g1::affine_element* elements, char const* buffer, size_t buffer_size); -void byteswap(g1::affine_element* elements, size_t buffer_size); + +template void byteswap(typename Curve::AffineElement* elements, size_t buffer_size); void read_g2_elements_from_buffer(g2::affine_element* elements, char const* buffer, size_t buffer_size); void byteswap(g2::affine_element* elements, size_t buffer_size); @@ -61,7 +62,7 @@ struct Manifest { std::string get_transcript_path(std::string const& dir, size_t num); -void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); +// void read_transcript_g1(g1::affine_element* monomials, size_t degree, std::string const& dir); // void read_transcript_g2(g2::affine_element& g2_x, std::string const& dir); @@ -88,3 +89,17 @@ void write_transcript(g1::affine_element const* g1_x, } // namespace io } // namespace grumpkin + +// WORKTODO: hack +namespace barretenberg { +namespace io { +extern template void read_transcript_g1(barretenberg::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +extern template void byteswap(barretenberg::g1::affine_element* elements, size_t buffer_size); +extern template void read_transcript_g1(grumpkin::g1::affine_element* monomials, + size_t degree, + std::string const& dir); +extern template void byteswap(grumpkin::g1::affine_element* elements, size_t buffer_size); +} // namespace io +} // namespace barretenberg diff --git a/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp index cceb6eef82..a53784be2b 100644 --- a/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp +++ 
b/cpp/src/barretenberg/srs/reference_string/env_reference_string.hpp @@ -11,7 +11,7 @@ #include "barretenberg/ecc/curves/bn254/g1.hpp" #include "barretenberg/ecc/curves/bn254/g2.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" #include "barretenberg/env/crs.hpp" @@ -32,7 +32,7 @@ class EnvReferenceString : public ProverReferenceString { private: size_t num_points; - scalar_multiplication::Pippenger pippenger_; + scalar_multiplication::Pippenger pippenger_; }; class EnvReferenceStringFactory : public ReferenceStringFactory { diff --git a/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp index ef6bebfc0a..37622675e9 100644 --- a/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/file_reference_string.hpp @@ -6,7 +6,7 @@ #include "barretenberg/ecc/curves/bn254/g1.hpp" #include "barretenberg/ecc/curves/bn254/g2.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" #include #include @@ -46,7 +46,7 @@ class FileReferenceString : public ProverReferenceString { private: size_t num_points; - scalar_multiplication::Pippenger pippenger_; + scalar_multiplication::Pippenger pippenger_; }; class FileReferenceStringFactory : public ReferenceStringFactory { diff --git a/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp index 608446fa77..50cbacbcac 100644 --- a/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/mem_reference_string.hpp @@ -5,7 +5,7 @@ #include "reference_string.hpp" -#include "barretenberg/ecc/curves/bn254/scalar_multiplication/pippenger.hpp" +#include "barretenberg/ecc/curves/scalar_multiplication/pippenger.hpp" // WORKTODO: needed? 
namespace barretenberg::pairing { struct miller_lines; diff --git a/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp b/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp index 53c6867018..6bea00b53f 100644 --- a/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp +++ b/cpp/src/barretenberg/srs/reference_string/pippenger_reference_string.hpp @@ -15,7 +15,7 @@ using namespace barretenberg; class PippengerReferenceString : public ProverReferenceString { public: - PippengerReferenceString(scalar_multiplication::Pippenger* pippenger) + PippengerReferenceString(scalar_multiplication::Pippenger* pippenger) : pippenger_(pippenger) {} @@ -23,12 +23,12 @@ class PippengerReferenceString : public ProverReferenceString { g1::affine_element* get_monomial_points() override { return pippenger_->get_point_table(); } private: - scalar_multiplication::Pippenger* pippenger_; + scalar_multiplication::Pippenger* pippenger_; }; class PippengerReferenceStringFactory : public ReferenceStringFactory { public: - PippengerReferenceStringFactory(scalar_multiplication::Pippenger* pippenger, uint8_t const* g2x) + PippengerReferenceStringFactory(scalar_multiplication::Pippenger* pippenger, uint8_t const* g2x) : pippenger_(pippenger) , g2x_(g2x) {} @@ -47,7 +47,7 @@ class PippengerReferenceStringFactory : public ReferenceStringFactory { } private: - scalar_multiplication::Pippenger* pippenger_; + scalar_multiplication::Pippenger* pippenger_; uint8_t const* g2x_; };
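
Note on the API shape introduced above (not part of the patch): the hunks thread a curve template parameter through the scalar-multiplication entry points (pippenger_runtime_state, pippenger, pippenger_unsafe, generate_pippenger_point_table, the Pippenger wrapper, and io::read_transcript_g1 / io::byteswap), with the curve class supplying the ScalarField, BaseField, Group, Element and AffineElement aliases shown in the secp256k1/secp256r1 hunks. The sketch below is a minimal illustration of how a caller might drive the templatized MSM path. It is an assumption-laden sketch: the exact template-argument spelling (e.g. curve::BN254, curve::Grumpkin, or a generic Curve) is not visible in the text above because the angle-bracket arguments were lost; the argument lists mirror the test hunks verbatim, and msm is a hypothetical helper name.

    // Hypothetical usage sketch, not part of this patch. Template-argument
    // placement is assumed from the surrounding diff; function argument lists
    // follow the scalar_multiplication tests above.
    #include "barretenberg/ecc/curves/bn254/bn254.hpp"
    #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
    #include "barretenberg/ecc/curves/scalar_multiplication/scalar_multiplication.hpp"

    template <typename Curve>
    typename Curve::Element msm(typename Curve::ScalarField* scalars,
                                typename Curve::AffineElement* points, // caller provides 2 * num_points slots
                                size_t num_points)
    {
        // Expand the input points into the endomorphism point table in place,
        // exactly as the tests do before every pippenger call.
        barretenberg::scalar_multiplication::generate_pippenger_point_table<Curve>(points, points, num_points);
        // Scratch state sized for this multi-exponentiation.
        barretenberg::scalar_multiplication::pippenger_runtime_state<Curve> state(num_points);
        // Run Pippenger's algorithm; per the undersized_inputs test, small inputs
        // fall back to plain scalar multiplication internally.
        return barretenberg::scalar_multiplication::pippenger<Curve>(scalars, points, num_points, state);
    }

    // The same helper would serve both curves exercised by the tests, e.g.:
    //   auto p = msm<curve::BN254>(bn254_scalars, bn254_points, n);
    //   auto q = msm<curve::Grumpkin>(grumpkin_scalars, grumpkin_points, n);

One design point that is visible in the diff itself: io.cpp now ends with explicit instantiations of read_transcript_g1 and byteswap for the bn254 and grumpkin g1 affine element types, paired with extern template declarations at the bottom of io.hpp (both marked "WORKTODO: hack"), so these templates are compiled once in io.cpp instead of in every translation unit that includes the header.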