From 05a61024fb9ff7f82c130e4fbf915555c1c18a1a Mon Sep 17 00:00:00 2001 From: Zachary James Williamson <blorktronics@gmail.com> Date: Wed, 1 Mar 2023 18:59:37 +0000 Subject: [PATCH] implemented keccak hash in stdlib (#114) * implemented keccak hash in stdlib * making some minor naming fixes after rebase --------- Co-authored-by: ledwards2225 <l.edwards.d@gmail.com> --- cpp/src/aztec/common/constexpr_utils.hpp | 189 ++++++ .../plookup_tables/keccak/keccak_chi.hpp | 253 +++++++ .../plookup_tables/keccak/keccak_input.hpp | 144 ++++ .../plookup_tables/keccak/keccak_output.hpp | 175 +++++ .../plookup_tables/keccak/keccak_rho.hpp | 296 +++++++++ .../plookup_tables/keccak/keccak_theta.hpp | 255 +++++++ .../plookup_tables/plookup_tables.cpp | 14 + .../plookup_tables/plookup_tables.hpp | 42 ++ .../plonk/composer/plookup_tables/types.hpp | 21 +- .../aztec/plonk/composer/ultra_composer.cpp | 14 + .../aztec/plonk/composer/ultra_composer.hpp | 29 +- cpp/src/aztec/stdlib/hash/CMakeLists.txt | 3 +- .../aztec/stdlib/hash/keccak/CMakeLists.txt | 1 + cpp/src/aztec/stdlib/hash/keccak/keccak.cpp | 620 ++++++++++++++++++ cpp/src/aztec/stdlib/hash/keccak/keccak.hpp | 189 ++++++ .../aztec/stdlib/hash/keccak/keccak.test.cpp | 221 +++++++ 16 files changed, 2462 insertions(+), 4 deletions(-) create mode 100644 cpp/src/aztec/common/constexpr_utils.hpp create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp create mode 100644 cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.cpp create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.hpp create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp diff --git a/cpp/src/aztec/common/constexpr_utils.hpp b/cpp/src/aztec/common/constexpr_utils.hpp new file mode 100644 index 0000000000..08afbcd200 --- /dev/null +++ b/cpp/src/aztec/common/constexpr_utils.hpp @@ -0,0 +1,189 @@ +#pragma once + +#include <stddef.h> +#include <utility> +#include <tuple> + +/** + * @brief constexpr_utils defines some helper methods that perform some stl-equivalent operations + * but in a constexpr context over quantities known at compile-time + * + * Current methods are: + * + * constexpr_for : loop over a range , where the size_t iterator `i` is a constexpr variable + * constexpr_find : find if an element is in an array + * concatenate_arrays : smoosh multiple std::array objects into a single std::array + * + */ +namespace barretenberg { + +/** + * @brief Implements a loop using a compile-time iterator. Requires c++20. + * Implementation (and description) from https://artificial-mind.net/blog/2020/10/31/constexpr-for + * + * @tparam Start the loop start value + * @tparam End the loop end value + * @tparam Inc how much the iterator increases by per iteration + * @tparam F a Lambda function that is executed once per loop + * + * @param f An rvalue reference to the lambda + * @details Implements a `for` loop where the iterator is a constexpr variable. + * Use this when you need to evaluate `if constexpr` statements on the iterator (or apply other constexpr expressions) + * Outside of this use-case avoid using this fn as it gives negligible performance increases vs regular loops. + * + * N.B. A side-effect of this method is that all loops will be unrolled + * (each loop iteration uses different iterator template parameters => unique constexpr_for implementation per + * iteration) + * Do not use this for large (~100+) loops! + * + * ############################## + * EXAMPLE USE OF `constexpr_for` + * ############################## + * + * constexpr_for<0, 10, 1>([&]<size_t i>(){ + * if constexpr (i & 1 == 0) + * { + * foo[i] = even_container[i >> 1]; + * } + * else + * { + * foo[i] = odd_container[i >> 1]; + * } + * }); + * + * In the above example we are iterating from i = 0 to i < 10. + * The provided lambda function has captured everything in its surrounding scope (via `[&]`), + * which is where `foo`, `even_container` and `odd_container` have come from. + * + * We do not need to explicitly define the `class F` parameter as the compiler derives it from our provided input + * argument `F&& f` (i.e. the lambda function) + * + * In the loop itself we're evaluating a constexpr if statement that defines which code path is taken. + * + * The above example benefits from `constexpr_for` because a run-time `if` statement has been reduced to a compile-time + * `if` statement. N.B. this would only give measurable improvements if the `constexpr_for` statement is itself in a hot + * loop that's iterated over many (>thousands) times + */ +template <size_t Start, size_t End, size_t Inc, class F> constexpr void constexpr_for(F&& f) +{ + // Call function `f<Start>()` iff Start < End + if constexpr (Start < End) { + // F must be a template lambda with a single **typed** template parameter that represents the iterator + // (e.g. [&]<size_t i>(){ ... } is good) + // (and [&]<typename i>(){ ... } won't compile!) + + /** + * Explaining f.template operator()<Start>() + * + * The following line must explicitly tell the compiler that <Start> is a template parameter by using the + * `template` keyword. + * (if we wrote f<Start>(), the compiler could legitimately interpret `<` as a less than symbol) + * + * The fragment `f.template` tells the compiler that we're calling a *templated* member of `f`. + * The "member" being called is the function operator, `operator()`, which must be explicitly provided + * (for any function X, `X(args)` is an alias for `X.operator()(args)`) + * The compiler has no alias `X.template <tparam>(args)` for `X.template operator()<tparam>(args)` so we must + * write it explicitly here + * + * To summarise what the next line tells the compiler... + * 1. I want to call a member of `f` that expects one or more template parameters + * 2. The member of `f` that I want to call is the function operator + * 3. The template parameter is `Start` + * 4. The funtion operator itself contains no arguments + */ + f.template operator()<Start>(); + + // Once we have executed `f`, we recursively call the `constexpr_for` function, increasing the value of `Start` + // by `Inc` + constexpr_for<Start + Inc, End, Inc>(f); + } +} + +/** + * @brief returns true/false depending on whether `key` is in `container` + * + * @tparam container i.e. what are we looking in? + * @tparam key i.e. what are we looking for? + * @return true found! + * @return false not found! + * + * @details method is constexpr and can be used in static_asserts + */ +template <const auto& container, auto key> constexpr bool constexpr_find() +{ + // using ElementType = typename std::remove_extent<ContainerType>::type; + bool found = false; + constexpr_for<0, container.size(), 1>([&]<size_t k>() { + if constexpr (std::get<k>(container) == key) { + found = true; + } + }); + return found; +} + +/** + * @brief merges multiple std::arrays into a single array. + * Array lengths can be different but array type must match + * Method is constexpr and should concat constexpr arrays at compile time + * + * @tparam Type the array type + * @tparam sizes template parameter pack of size_t value params + * @param arrays + * @return constexpr auto + * + * @details template params should be autodeducted. Example use case: + * + * ``` + * std::array<int, 2> a{1, 2}; + * std::array<int, 3> b{1,3, 5}; + * std::array<int, 5> c = concatenate(a, b); + * ``` + */ +template <typename Type, std::size_t... sizes> +constexpr auto concatenate_arrays(const std::array<Type, sizes>&... arrays) +{ + return std::apply([](auto... elems) -> std::array<Type, (sizes + ...)> { return { { elems... } }; }, + std::tuple_cat(std::tuple_cat(arrays)...)); +} + +/** + * @brief Create a constexpr array object whose elements contain a default value + * + * @tparam T type contained in the array + * @tparam Is index sequence + * @param value the value each array element is being initialized to + * @return constexpr std::array<T, sizeof...(Is)> + * + * @details This method is used to create constexpr arrays whose encapsulated type: + * + * 1. HAS NO CONSTEXPR DEFAULT CONSTRUCTOR + * 2. HAS A CONSTEXPR COPY CONSTRUCTOR + * + * An example of this is barretenberg::field_t + * (the default constructor does not default assign values to the field_t member variables for efficiency reasons, to + * reduce the time require to construct large arrays of field elements. This means the default constructor for field_t + * cannot be constexpr) + */ +template <typename T, std::size_t... Is> +constexpr std::array<T, sizeof...(Is)> create_array(T value, std::index_sequence<Is...>) +{ + // cast Is to void to remove the warning: unused value + std::array<T, sizeof...(Is)> result = { { (static_cast<void>(Is), value)... } }; + return result; +} + +/** + * @brief Create a constexpr array object whose values all are 0 + * + * @tparam T + * @tparam N + * @return constexpr std::array<T, N> + * + * @details Use in the same context as create_array, i.e. when encapsulated type has a default constructor that is not + * constexpr + */ +template <typename T, size_t N> constexpr std::array<T, N> create_empty_array() +{ + return create_array(T(0), std::make_index_sequence<N>()); +} +}; // namespace barretenberg \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp new file mode 100644 index 0000000000..e7245a665d --- /dev/null +++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp @@ -0,0 +1,253 @@ +#pragma once + +#include <numeric/bitop/pow.hpp> +#include <common/constexpr_utils.hpp> +#include "../types.hpp" + +namespace plookup { +namespace keccak_tables { + +/** + * @brief Generates plookup tables required for CHI round of Keccak hash function + * + * Keccak has 25 hash lanes, each represented as 64-bit integers. The CHI round performs the following operation on 3 + * hash lanes: + * + * A ^ (~B & C) + * + * We evaluate in-circuit using a base-11 sparse integer representation: + * + * P = \sum_{j=0}^63 b_i * 11^i + * + * In this representation we evaluate CHI via the linear expression + * + * 2.A - B + C + Q + * + * Where Q is the precomputed constant \sum_{i=0}^63 11^i + * + * This can be represented via the 'truth table' for each base-11 quasi-bit: + * + * | A | B | C | Algebraic Output | + * | - | - | - | ---------------- | + * | 0 | 0 | 0 | 1 | + * | 0 | 0 | 1 | 2 | + * | 0 | 1 | 0 | 0 | + * | 1 | 0 | 0 | 3 | + * | 1 | 0 | 1 | 4 | + * | 1 | 1 | 0 | 2 | + * | 1 | 1 | 1 | 3 | + * + * CHI round uses a plookup table that normalizes the algebraic output back into the binary output. + * + * | Algebraic Output | Binary Output | + * | ---------------- | ------------- | + * | 0 | 0 | + * | 1 | 0 | + * | 2 | 1 | + * | 3 | 1 | + * | 4 | 0 | + * + * In addition we also use the CHI lookup table to determine the value of the most significant (63rd) bit of the output + * + * for all M in [0, ..., TABLE_BITS - 1] and K in [0, 1, 2, 3, 4], the column values of our lookup table are: + * + * Column1 value = \sum_{i \in M} \sum_{j \in K} 11^i * j] + * Column2 value = \sum_{i \in M} \sum_{j \in K} 11^i * CHI_NORMALIZATION_TABLE[j]] + * Column3 value = Column2 / 11^8 + * + */ +class Chi { + public: + // 1 + 2a - b + c => a xor (~b & c) + static constexpr uint64_t CHI_NORMALIZATION_TABLE[5]{ + 0, 0, 1, 1, 0, + }; + + static constexpr uint64_t BASE = 11; + + // effective base = maximum value each input 'quasi-bit' can reach at this stage of the Keccak algo + // (we use base11 as it's a bit more efficient to use a consistent base across the entire Keccak hash algorithm. + // The THETA round requires base-11 in order to most efficiently convert XOR operations into algebraic operations) + static constexpr uint64_t EFFECTIVE_BASE = 5; + + // TABLE_BITS defines table size. More bits = fewer lookups but larger tables! + static constexpr uint64_t TABLE_BITS = 6; + + /** + * @brief Given a table input value, return the table output value + * + * Used by the Plookup code to precompute lookup tables and generate witness values + * + * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value) + * @return std::array<barretenberg::fr, 2> table output (normalized input and normalized input / 11^8) + */ + static std::array<barretenberg::fr, 2> get_chi_renormalization_values(const std::array<uint64_t, 2> key) + { + uint64_t accumulator = 0; + uint64_t input = key[0]; + uint64_t base_shift = 1; + constexpr uint64_t divisor_exponent = (64 % TABLE_BITS == 0) ? (TABLE_BITS - 1) : (64 % TABLE_BITS) - 1; + constexpr uint64_t divisor = numeric::pow64(BASE, divisor_exponent); + while (input > 0) { + uint64_t slice = input % BASE; + uint64_t bit = CHI_NORMALIZATION_TABLE[static_cast<size_t>(slice)]; + accumulator += (bit * base_shift); + input /= BASE; + base_shift *= BASE; + } + + return { barretenberg::fr(accumulator), barretenberg::fr(accumulator / divisor) }; + } + + /** + * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1]) + * Code is slightly faster at runtime if we compute this at compile time + * + * @return constexpr std::array<uint64_t, TABLE_BITS> + */ + static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases() + { + std::array<uint64_t, TABLE_BITS> result; + uint64_t acc = 1; + for (size_t i = 0; i < TABLE_BITS; ++i) { + result[i] = acc; + acc *= BASE; + } + return result; + } + + /** + * @brief Get column values for next row of plookup table. Used to generate plookup table row values + * + * Input `counts` is an array of quasi-bits that represent the current row. + * Method increases `counts` by 1 and returns the plookup table column values. + * + * (a bit tricky to compute because each quasi-bit ranges from [0, 1, 2, 3, 4], + * but we're working with base-11 numbers. + * i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!) + * + * @param counts The current row value represented as an array of quasi-bits + * @return std::array<uint64_t, 3> the columns of the plookup table + */ + static std::array<uint64_t, 3> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts) + { + static constexpr auto scaled_bases = get_scaled_bases(); + + // want the most significant bit of the 64-bit integer, when this table is used on the most significant slice + constexpr uint64_t divisor_exponent = (64 % TABLE_BITS == 0) ? (TABLE_BITS - 1) : (64 % TABLE_BITS) - 1; + constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), divisor_exponent); + + for (size_t i = 0; i < TABLE_BITS; ++i) { + if (counts[i] == EFFECTIVE_BASE - 1) { + counts[i] = 0; + } else { + counts[i] += 1; + break; + } + } + + uint64_t value = 0; + uint64_t normalized_value = 0; + for (size_t i = 0; i < TABLE_BITS; ++i) { + value += counts[i] * scaled_bases[i]; + normalized_value += (CHI_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i]; + } + return { value, normalized_value, normalized_value / divisor }; + } + + /** + * @brief Generate the CHI plookup table + * + * This table is used by Composer objects to generate plookup constraints + * + * @param id a compile-time ID defined via plookup_tables.hpp + * @param table_index a circuit-specific ID for the table used by the circuit Composer + * @return BasicTable + */ + static BasicTable generate_chi_renormalization_table(BasicTableId id, const size_t table_index) + { + BasicTable table; + table.id = id; + table.table_index = table_index; + table.use_twin_keys = false; + table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS); + + std::array<size_t, TABLE_BITS> counts{}; + std::array<uint64_t, 3> column_values{ 0, 0, 0 }; + for (size_t i = 0; i < table.size; ++i) { + table.column_1.emplace_back(column_values[0]); + table.column_2.emplace_back(column_values[1]); + table.column_3.emplace_back(column_values[2]); + column_values = get_column_values_for_next_row(counts); + } + + table.get_values_from_key = &get_chi_renormalization_values; + + constexpr uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS); + table.column_1_step_size = barretenberg::fr(step_size); + table.column_2_step_size = barretenberg::fr(step_size); + table.column_3_step_size = barretenberg::fr(0); + return table; + } + + /** + * @brief Create the CHI MultiTable used by plookup to generate a sequence of lookups + * + * The CHI round operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits. + * + * i.e. multiple lookups are required for a single 64-bit integer. + * + * If we group these lookups together, we can derive the plookup column values + * from the relative difference between wire values. + * + * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the + * output slices + * + * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B, C where + * A = \sum_{i=0}^7 A_i * 11^8 + * B = \sum_{i=0}^7 B_i * 11^8 + * C_i = B_i / 11^8 (to get the most significant bit of B) + * + * Our plookup gates will produce a gates with the following wire values: + * + * | W1 | W2 | W3 | + * | ----------------------- | ----------------------- | --- | + * | \sum_{i=0}^7 A_i * 11^i | \sum_{i=0}^7 B_i * 11^i | C_0 | + * | \sum_{i=1}^7 A_i * 11^i | \sum_{i=1}^7 B_i * 11^i | C_1 | + * | \sum_{i=2}^7 A_i * 11^i | \sum_{i=2}^7 B_i * 11^i | C_2 | + * | ... | ... | ... | + * | A^7 | B^7 | C^7 | + * + * The plookup protocol extracts the 1st and 2nd lookup column values by taking: + * + * Colunn1 = W1[i] - 11^8 . W1[i + 1] + * Colunn2 = W2[i] - 11^8 . W2[i + 1] + * + * (where the -11^8 coefficient is stored in a precomputed selector polynomial) + * + * This MultiTable construction defines the value of these precomputed selector polynomial values, + * as well as defines how the column values are derived from a starting input value. + * + * @param id + * @return MultiTable + */ + static MultiTable get_chi_output_table(const MultiTableId id = KECCAK_CHI_OUTPUT) + { + constexpr size_t num_tables_per_multitable = + (64 / TABLE_BITS) + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 bits, 8 bits per entry + + // column_multiplier is used to define the gap between rows when deriving colunn values via relative differences + uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS); + MultiTable table(column_multiplier, column_multiplier, 0, num_tables_per_multitable); + + table.id = id; + for (size_t i = 0; i < num_tables_per_multitable; ++i) { + table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS)); + table.lookup_ids.emplace_back(KECCAK_CHI); + table.get_table_values.emplace_back(&get_chi_renormalization_values); + } + return table; + } +}; +} // namespace keccak_tables +} // namespace plookup \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp new file mode 100644 index 0000000000..5da5c79074 --- /dev/null +++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include <numeric/bitop/pow.hpp> +#include <numeric/bitop/sparse_form.hpp> +#include <common/constexpr_utils.hpp> +#include "../types.hpp" + +namespace plookup { +namespace keccak_tables { + +/** + * @brief Generates plookup tables used convert 64-bit integers into a sparse representation used for Keccak hash + * algorithm + * + * Keccak has 25 hash lanes, each represented as 64-bit integers. + * + * We evaluate in-circuit using a base-11 sparse integer representation for each lane: + * + * P = \sum_{j=0}^63 b_i * 11^i + * + * KeccakInput defines the plookup table that maps binary integer slices into base-11 integer slices. + * + * In addition, KeccakInput also is used to determine the value of the most significant (63rd) bit of the input + * (which is used by stdlib::keccak to more efficiently left-rotate by 1 bit) + */ +class KeccakInput { + + public: + static constexpr uint64_t BASE = 11; + static constexpr size_t TABLE_BITS = 8; + + /** + * @brief Given a table input value, return the table output value + * + * Used by the Plookup code to precompute lookup tables and generate witness values + * + * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value) + * @return std::array<barretenberg::fr, 2> table output + */ + static std::array<barretenberg::fr, 2> get_keccak_input_values(const std::array<uint64_t, 2> key) + { + const uint256_t t0 = numeric::map_into_sparse_form<BASE>(key[0]); + + constexpr size_t msb_shift = (64 % TABLE_BITS == 0) ? TABLE_BITS - 1 : (64 % TABLE_BITS) - 1; + const uint256_t t1 = key[0] >> msb_shift; + return { barretenberg::fr(t0), barretenberg::fr(t1) }; + } + + /** + * @brief Generate plookup table that maps a TABLE_BITS-slice of a base-2 integer into a base-11 representation + * + * @param id + * @param table_index + * @return BasicTable + */ + static BasicTable generate_keccak_input_table(BasicTableId id, const size_t table_index) + { + BasicTable table; + table.id = id; + table.table_index = table_index; + table.size = (1U << TABLE_BITS); + table.use_twin_keys = false; + constexpr size_t msb_shift = (64 % TABLE_BITS == 0) ? TABLE_BITS - 1 : (64 % TABLE_BITS) - 1; + + for (uint64_t i = 0; i < table.size; ++i) { + const uint64_t source = i; + const auto target = numeric::map_into_sparse_form<BASE>(source); + table.column_1.emplace_back(barretenberg::fr(source)); + table.column_2.emplace_back(barretenberg::fr(target)); + table.column_3.emplace_back(barretenberg::fr(source >> msb_shift)); + } + + table.get_values_from_key = &get_keccak_input_values; + + uint256_t sparse_step_size = 1; + for (size_t i = 0; i < TABLE_BITS; ++i) { + sparse_step_size *= BASE; + } + table.column_1_step_size = barretenberg::fr((1 << TABLE_BITS)); + table.column_2_step_size = barretenberg::fr(sparse_step_size); + table.column_3_step_size = barretenberg::fr(sparse_step_size); + + return table; + } + + /** + * @brief Create the KeccakInput MultiTable used by plookup to generate a sequence of lookups + * + * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits. + * + * i.e. multiple lookups are required for a single 64-bit integer. + * + * If we group these lookups together, we can derive the plookup column values + * from the relative difference between wire values. + * + * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the + * output slices + * + * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B, C where + * A = \sum_{i=0}^7 A_i * 2^8 + * B = \sum_{i=0}^7 B_i * 11^8 + * C_i = B_i >> 7 (to get the most significant bit of B) + * + * Our plookup gates will produce a gates with the following wire values: + * + * | W1 | W2 | W3 | + * | ----------------------- | ----------------------- | --- | + * | \sum_{i=0}^7 A_i * 2^i | \sum_{i=0}^7 B_i * 11^i | C_0 | + * | \sum_{i=1}^7 A_i * 2^i | \sum_{i=1}^7 B_i * 11^i | C_1 | + * | \sum_{i=2}^7 A_i * 2^i | \sum_{i=2}^7 B_i * 11^i | C_2 | + * | ... | ... | ... | + * | A^7 | B^7 | C^7 | + * + * The plookup protocol extracts the 1st and 2nd lookup column values by taking: + * + * Colunn1 = W1[i] - 2^8 . W1[i + 1] + * Colunn2 = W2[i] - 11^8 . W2[i + 1] + * + * (where the -11^8 coefficient is stored in a precomputed selector polynomial) + * + * This MultiTable construction defines the value of these precomputed selector polynomial values, + * as well as defines how the column values are derived from a starting input value. + * + * @param id + * @return MultiTable + */ + static MultiTable get_keccak_input_table(const MultiTableId id = KECCAK_FORMAT_INPUT) + { + const size_t num_entries = 8; + + MultiTable table(1 << 8, uint256_t(11).pow(8), 0, num_entries); + + table.id = id; + for (size_t i = 0; i < num_entries; ++i) { + table.slice_sizes.emplace_back(1 << 8); + table.lookup_ids.emplace_back(KECCAK_INPUT); + table.get_table_values.emplace_back(&get_keccak_input_values); + } + return table; + } +}; + +} // namespace keccak_tables +} // namespace plookup \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp new file mode 100644 index 0000000000..4f5fcdd0f7 --- /dev/null +++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp @@ -0,0 +1,175 @@ +#pragma once + +#include <numeric/bitop/pow.hpp> +#include <numeric/bitop/sparse_form.hpp> +#include <common/constexpr_utils.hpp> + +#include "../types.hpp" +#include "../sparse.hpp" + +namespace plookup { +namespace keccak_tables { + +/** + * @brief Converts a base-11 sparse integer representation into a regular base-2 binary integer. + * Used by the Keccak hash algorithm to convert the output of the algorithm into a regular integer. + */ +class KeccakOutput { + + public: + static constexpr uint64_t BASE = 11; + + // effective base = maximum value each input 'quasi-bit' can reach + // (the input is in base-11 representation, but at this point in the Keccak algorithm each 'quasi-bit' can only + // take values [0, 1] not [0, ..., 10] + static constexpr uint64_t EFFECTIVE_BASE = 2; + static constexpr size_t TABLE_BITS = 8; + + static constexpr uint64_t OUTPUT_NORMALIZATION_TABLE[2]{ 0, 1 }; + + /** + * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1]) + * Code is slightly faster at runtime if we compute this at compile time + * + * @return constexpr std::array<uint64_t, TABLE_BITS> + */ + static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases() + { + std::array<uint64_t, TABLE_BITS> result; + uint64_t acc = 1; + for (size_t i = 0; i < TABLE_BITS; ++i) { + result[i] = acc; + acc *= BASE; + } + return result; + } + + /** + * @brief Get column values for next row of plookup table. Used to generate plookup table row values + * + * Input `counts` is an array of quasi-bits that represent the current row. + * Method increases `counts` by 1 and returns the plookup table column values. + * + * (a bit tricky to compute because each quasi-bit ranges from [0, 1], + * but we're working with base-11 numbers. + * i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!) + * + * @param counts The current row value represented as an array of quasi-bits + * @return std::array<uint64_t, uint64_t> first and second columns of plookup table (3rd column is 0) + */ + static std::array<uint64_t, 2> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts) + { + static constexpr auto scaled_bases = get_scaled_bases(); + + for (size_t i = 0; i < TABLE_BITS; ++i) { + if (counts[i] == EFFECTIVE_BASE - 1) { + counts[i] = 0; + } else { + counts[i] += 1; + break; + } + } + + uint64_t value = 0; + uint64_t normalized_value = 0; + for (size_t i = 0; i < TABLE_BITS; ++i) { + value += counts[i] * scaled_bases[i]; + normalized_value += (OUTPUT_NORMALIZATION_TABLE[counts[i]]) << i; + } + return { value, normalized_value }; + } + + /** + * @brief Generate plookup table that maps a TABLE_BITS-slice of a base-11 integer into a base-2 integer + * + * @param id + * @param table_index + * @return BasicTable + */ + static BasicTable generate_keccak_output_table(BasicTableId id, const size_t table_index) + { + BasicTable table; + table.id = id; + table.table_index = table_index; + table.use_twin_keys = false; + table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS); + + std::array<size_t, TABLE_BITS> counts{}; + std::array<uint64_t, 2> column_values{ 0, 0 }; + + for (size_t i = 0; i < table.size; ++i) { + table.column_1.emplace_back(column_values[0]); + table.column_2.emplace_back(column_values[1]); + table.column_3.emplace_back(0); + column_values = get_column_values_for_next_row(counts); + } + + table.get_values_from_key = &sparse_tables::get_sparse_normalization_values<BASE, OUTPUT_NORMALIZATION_TABLE>; + + table.column_1_step_size = barretenberg::fr(numeric::pow64(static_cast<size_t>(BASE), TABLE_BITS)); + table.column_2_step_size = barretenberg::fr(((uint64_t)1 << TABLE_BITS)); + table.column_3_step_size = 0; + return table; + } + + /** + * @brief Create the KeccakOutput MultiTable used by plookup to generate a sequence of lookups + * + * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits. + * + * i.e. multiple lookups are required for a single 64-bit integer. + * + * If we group these lookups together, we can derive the plookup column values + * from the relative difference between wire values. + * + * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the + * output slices + * + * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B where + * A = \sum_{i=0}^7 A_i * 11^8 + * B = \sum_{i=0}^7 B_i * 2^8 + * + * Our plookup gates will produce a gates with the following wire values: + * + * | W1 | W2 | W3 | + * | ----------------------- | ----------------------- | --- | + * | \sum_{i=0}^7 A_i * 2^i | \sum_{i=0}^7 B_i * 11^i | 0 | + * | \sum_{i=1}^7 A_i * 2^i | \sum_{i=1}^7 B_i * 11^i | 0 | + * | \sum_{i=2}^7 A_i * 2^i | \sum_{i=2}^7 B_i * 11^i | 0 | + * | ... | ... | ... | + * | A^7 | B^7 | 0 | + * + * The plookup protocol extracts the 1st and 2nd lookup column values by taking: + * + * Colunn1 = W1[i] - 11^8 . W1[i + 1] + * Colunn2 = W2[i] - 2^8 . W2[i + 1] + * + * (where the -11^8, -2^8 coefficients are stored in a precomputed selector polynomial) + * + * This MultiTable construction defines the value of these precomputed selector polynomial values, + * as well as defines how the column values are derived from a starting input value. + * + * @param id + * @return MultiTable + */ + static MultiTable get_keccak_output_table(const MultiTableId id = KECCAK_FORMAT_OUTPUT) + { + constexpr size_t num_tables_per_multitable = + 64 / TABLE_BITS + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 bits, 8 bits per entry + + uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS); + MultiTable table(column_multiplier, (1ULL << TABLE_BITS), 0, num_tables_per_multitable); + + table.id = id; + for (size_t i = 0; i < num_tables_per_multitable; ++i) { + table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS)); + table.lookup_ids.emplace_back(KECCAK_OUTPUT); + table.get_table_values.emplace_back( + &sparse_tables::get_sparse_normalization_values<BASE, OUTPUT_NORMALIZATION_TABLE>); + } + return table; + } +}; + +} // namespace keccak_tables +} // namespace plookup \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp new file mode 100644 index 0000000000..e7bd5ad223 --- /dev/null +++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp @@ -0,0 +1,296 @@ +#pragma once + +#include <numeric/bitop/pow.hpp> +#include <common/constexpr_utils.hpp> +#include "../types.hpp" + +namespace plookup { +namespace keccak_tables { + +/** + * @brief Generate the plookup tables used for the RHO round of the Keccak hash algorithm + * + * Keccak has 25 hash lanes, each represented as 64-bit integers. + * The RHO round performs a left-rotation on each lane, by a fixed rotation value defined by the ROTATIONS arrray + * + * We evaluate Keccak in-circuit using a base-11 sparse integer representation of each hash lane: + * + * P = \sum_{j=0}^63 b_i * 11^i + * + * This allows us to replace binary operations (XOR, AND) with algebraic ones, combined with plookup tables. + * (see keccak_chi.hpp for comments on this). + * + * At this point in the algorithm, each hash lane 'quasi-bit' is in the range [0, 1, 2]. + * + * The RHO lookup tables are used to perform the following: + * + * 1. Normalize each quasi-bit so that P_out = \sum_{j=0}^63 (b_i mod 2) * 11^i + * 2. Perform a left-rotation whose value is defined by a value in the ROTATIONS array + * 3. Extract the most significant bit of the non-rotated normalized output + * + * The most significant bit component is required because we use this table to + * normalize XOR operations in the IOTA round and the `sponge_absorb` phase of the algorighm. + * (Both IOTA and `sponge_absorb` are followed by the THETA round which requires the msb of each hash lane) + * + * Rotations are performed by splitting the input into 'left' and 'right' bit slices + * (left slice = bits that wrap around the 11^64 modulus of our base-11 integers) + * (right slice = bits that don't wrap) + * + * Both slices are fed into a Rho plookup table. The outputs are stitched together to produce the rotated value. + * + * We need multiple Rho tables in order to efficiently range-constrain our input slices. + * + * The maximum number of bits we can accomodate in this lookup table is MAXIMUM_MULTITABLE_BITS (assume this is 8) + * For example take a left-rotation by 1 bit. The right-slice will be a 63-bit integer. + * 63 does not evenly divide 8. i.e. an 8-bit table cannot correctly range-constrain the input slice and we would need + * additional range constraints. + * We solve this problem by creating multiple Rho lookup tables each of different sizes (1 bit, 2 bits, ..., 8 bits). + * + * We can stitch together a lookup table sequence that correctly range constrains the left/right slices for any of our + * 25 rotation values + * + * @tparam TABLE_BITS The number of bits each lookup table can accomodate + * @tparam LANE_INDEX Required by get_rho_output_table to produce the correct MultiTable + */ +template <size_t TABLE_BITS = 0, size_t LANE_INDEX = 0> class Rho { + + public: + static constexpr uint64_t BASE = 11; + + // EFFECTIVE_BASE = maximum value each input 'quasi-bit' can reach at this stage in the Keccak algo + // (we use base11 as it's a bit more efficient to use a consistent base across the entire Keccak hash algorithm. + // The THETA round requires base-11 in order to most efficiently convert XOR operations into algebraic operations) + static constexpr uint64_t EFFECTIVE_BASE = 3; + + // The maximum number of bits of a Rho lookup table (TABLE_BITS <= MAXIMUM_MULTITABLE_BITS). + // Used in get_rho_output_table + static constexpr size_t MAXIMUM_MULTITABLE_BITS = 8; + + // Rotation offsets, y vertically, x horizontally: r[y * 5 + x] + static constexpr std::array<size_t, 25> ROTATIONS = { + 0, 1, 62, 28, 27, 36, 44, 6, 55, 20, 3, 10, 43, 25, 39, 41, 45, 15, 21, 8, 18, 2, 61, 56, 14, + }; + + static constexpr uint64_t RHO_NORMALIZATION_TABLE[3]{ + 0, + 1, + 0, + }; + + /** + * @brief Given a table input value, return the table output value + * + * Used by the Plookup code to precompute lookup tables and generate witness values + * + * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value) + * @return std::array<barretenberg::fr, 2> table output (normalized input and normalized input / 11^TABLE_BITS - 1) + */ + static std::array<barretenberg::fr, 2> get_rho_renormalization_values(const std::array<uint64_t, 2> key) + { + uint64_t accumulator = 0; + uint64_t input = key[0]; + uint64_t base_shift = 1; + constexpr uint64_t divisor_exponent = TABLE_BITS - 1; + constexpr uint64_t divisor = numeric::pow64(BASE, divisor_exponent); + while (input > 0) { + uint64_t slice = input % BASE; + uint64_t bit = RHO_NORMALIZATION_TABLE[static_cast<size_t>(slice)]; + accumulator += (bit * base_shift); + input /= BASE; + base_shift *= BASE; + } + + return { barretenberg::fr(accumulator), barretenberg::fr(accumulator / divisor) }; + } + + /** + * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1]) + * Code is slightly faster at runtime if we compute this at compile time + * + * @return constexpr std::array<uint64_t, TABLE_BITS> + */ + static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases() + { + std::array<uint64_t, TABLE_BITS> result; + uint64_t acc = 1; + for (size_t i = 0; i < TABLE_BITS; ++i) { + result[i] = acc; + acc *= BASE; + } + return result; + } + + /** + * @brief Get column values for next row of plookup table. Used to generate plookup table row values + * + * Input `counts` is an array of quasi-bits that represent the current row. + * Method increases `counts` by 1 and returns the plookup table column values. + * + * (a bit tricky to compute because each quasi-bit ranges from [0, 1, 2], + * but we're working with base-11 numbers. + * i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!) + * + * @param counts The current row value represented as an array of quasi-bits + * @return std::array<uint64_t, 3> the columns of the plookup table + */ + static std::array<uint64_t, 3> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts) + { + static constexpr auto scaled_bases = get_scaled_bases(); + + // want the most significant bit of the 64-bit integer, when this table is used on the most significant slice + constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS - 1); + + for (size_t i = 0; i < TABLE_BITS; ++i) { + if (counts[i] == EFFECTIVE_BASE - 1) { + counts[i] = 0; + } else { + counts[i] += 1; + break; + } + } + + uint64_t value = 0; + uint64_t normalized_value = 0; + for (size_t i = 0; i < TABLE_BITS; ++i) { + value += counts[i] * scaled_bases[i]; + normalized_value += (RHO_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i]; + } + return { value, normalized_value, normalized_value / divisor }; + } + + /** + * @brief Generate plookup table that normalizes a TABLE_BITS-slice of a base-11 integer and extracts the msb + * + * @param id + * @param table_index + * @return BasicTable + */ + static BasicTable generate_rho_renormalization_table(BasicTableId id, const size_t table_index) + { + BasicTable table; + table.id = id; + table.table_index = table_index; + table.use_twin_keys = false; + table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS); + + std::array<size_t, TABLE_BITS> counts{}; + std::array<uint64_t, 3> column_values{ 0, 0, 0 }; + + for (size_t i = 0; i < table.size; ++i) { + table.column_1.emplace_back(column_values[0]); + table.column_2.emplace_back(column_values[1]); + table.column_3.emplace_back(column_values[2]); + column_values = get_column_values_for_next_row(counts); + } + + table.get_values_from_key = &get_rho_renormalization_values; + + uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS); + table.column_1_step_size = barretenberg::fr(step_size); + table.column_2_step_size = barretenberg::fr(step_size); + table.column_3_step_size = barretenberg::fr(0); + return table; + } + + /** + * @brief Create the Rho MultiTable used by plookup to generate a sequence of lookups + * + * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits. + * + * Representing the RHO lookup as a sequence of accumulating sums is tricky because of rotations. + * + * For example, imagine our input is a 32-bit integer A represented as: A = A3.11^24 + A2.11^16 + A1.11^8 + A0, + * and our output is a 32-bit integer B = B3.11^24 + B2.11^16 + B1.11^8 + B0 + * + * In this example, we want to normalize A and left-rotate by 16 bits. + * + * Our lookup gate wire values will look like the following: + * + * | Row | C0 | C1 | C2 | + * | --- | -----------------------------------------| ------------ | -------- | + * | 0 | A3.11^24 + A2.11^16 + A1.11^8 + A0 | B1.11^8 + B0 | A0.msb() | + * | 1 | A3.11^16 + A2.11^8 + A1 | B1 | A1.msb() | + * | 2 | A1311^8 + A2 | B3.11^8 + B2 | A2.msb() | + * | 3 | A3 | B3 | A3.msb() | + * + * Finally, an addition gate is used to extract B = 11^32 * C1[0] + C1[2] + * + * We use the above structure because the plookup protocol derives lookup entries by taking: + * + * Colunn1 = W1[i] + Q1 . W1[i + 1] + * Colunn2 = W2[i] + Q2 . W2[i + 1] + * + * Where Q1, Q2 are constants encoded in selector polynomials. + * We do not have any spare selector polynomials to apply to W1[i] and W2[i] :( + * + * i.e. we cannot represent the column C1 as a sequence of accumulating sums whilst performing a bit rotation! + * The part of A that wraps around past 11^64 must be represented separately vs the part that does not. + * + * @param id + * @return MultiTable + */ + static MultiTable get_rho_output_table(const MultiTableId id = KECCAK_NORMALIZE_AND_ROTATE) + { + constexpr size_t left_bits = ROTATIONS[LANE_INDEX]; + constexpr size_t right_bits = 64 - ROTATIONS[LANE_INDEX]; + constexpr size_t num_left_tables = + left_bits / MAXIMUM_MULTITABLE_BITS + (left_bits % MAXIMUM_MULTITABLE_BITS > 0 ? 1 : 0); + constexpr size_t num_right_tables = + right_bits / MAXIMUM_MULTITABLE_BITS + (right_bits % MAXIMUM_MULTITABLE_BITS > 0 ? 1 : 0); + + MultiTable table; + table.id = id; + + table.column_1_step_sizes.push_back(1); + table.column_2_step_sizes.push_back(1); + table.column_3_step_sizes.push_back(1); + + // generate table selector values for the 'right' slice + barretenberg::constexpr_for<0, num_right_tables, 1>([&]<size_t i> { + constexpr size_t num_bits_processed = (i * MAXIMUM_MULTITABLE_BITS); + constexpr size_t bit_slice = (num_bits_processed + MAXIMUM_MULTITABLE_BITS > right_bits) + ? right_bits % MAXIMUM_MULTITABLE_BITS + : MAXIMUM_MULTITABLE_BITS; + + constexpr uint64_t scaled_base = numeric::pow64(BASE, bit_slice); + if (i == num_right_tables - 1) { + table.column_1_step_sizes.push_back(scaled_base); + table.column_2_step_sizes.push_back(0); + table.column_3_step_sizes.push_back(0); + } else { + table.column_1_step_sizes.push_back(scaled_base); + table.column_2_step_sizes.push_back(scaled_base); + table.column_3_step_sizes.push_back(0); + } + + table.slice_sizes.push_back(scaled_base); + table.get_table_values.emplace_back(&get_rho_renormalization_values); + table.lookup_ids.push_back((BasicTableId)((size_t)KECCAK_RHO_1 + (bit_slice - 1))); + }); + + // generate table selector values for the 'left' slice + barretenberg::constexpr_for<0, num_left_tables, 1>([&]<size_t i> { + constexpr size_t num_bits_processed = (i * MAXIMUM_MULTITABLE_BITS); + + constexpr size_t bit_slice = (num_bits_processed + MAXIMUM_MULTITABLE_BITS > left_bits) + ? left_bits % MAXIMUM_MULTITABLE_BITS + : MAXIMUM_MULTITABLE_BITS; + constexpr uint64_t scaled_base = numeric::pow64(BASE, bit_slice); + + if (i != num_left_tables - 1) { + table.column_1_step_sizes.push_back(scaled_base); + table.column_2_step_sizes.push_back(scaled_base); + table.column_3_step_sizes.push_back(0); + } + + table.slice_sizes.push_back(scaled_base); + table.get_table_values.emplace_back(&get_rho_renormalization_values); + table.lookup_ids.push_back((BasicTableId)((size_t)KECCAK_RHO_1 + (bit_slice - 1))); + }); + + return table; + } +}; + +} // namespace keccak_tables +} // namespace plookup \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp new file mode 100644 index 0000000000..4a621c4d3c --- /dev/null +++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp @@ -0,0 +1,255 @@ +#pragma once + +#include <numeric/bitop/pow.hpp> +#include <common/constexpr_utils.hpp> +#include "../types.hpp" + +namespace plookup { +namespace keccak_tables { + +/** + * @brief Generates plookup tables required for THETA round of Keccak hash function + * + * Keccak has 25 hash lanes, each represented as 64-bit integers. The THETA round performs the following operation on 3 + * hash lanes: + * + * C0 = A0 ^ A1 ^ A2 ^ A3 ^ A4 + * C1 = B0 ^ B1 ^ B2 ^ B3 ^ B4 + * D = C0 ^ ROTATE_LEFT(C1, 1) + * + * We evaluate in-circuit using a base-11 sparse integer representation: + * + * P = \sum_{j=0}^63 b_i * 11^i + * + * In this representation we evaluate CHI via the linear expression + * + * C0 = A0 + A1 + A2 + A3 + A4 + * C1 = B0 + B1 + B2 + B3 + B4 + * D = C0 + ROTATE_LEFT(C1, 1) + * + * We use base-11 spare representation because the maximum value of each 'quasi-bit' of D is 10 + * + * THETA round uses a plookup table that normalizes the algebraic output. + * + * This can be represented via the 'truth table' for each base-11 quasi-bit: + * + * | Algebraic Output | Binary Output | + * | ---------------- | ------------- | + * | 0 | 0 | + * | 1 | 1 | + * | 2 | 0 | + * | 3 | 1 | + * | 4 | 0 | + * | 5 | 1 | + * | 6 | 0 | + * | 7 | 1 | + * | 8 | 0 | + * | 9 | 1 | + * | 10 | 0 | + * + * i.e. even = 0, odd = 1 + * + */ +class Theta { + public: + static constexpr size_t TABLE_BITS = 4; + static constexpr uint64_t BASE = 11; + + static constexpr uint64_t THETA_NORMALIZATION_TABLE[11]{ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + }; + + // template <size_t i> static std::pair<uint64_t, uint64_t> update_counts(std::array<size_t, TABLE_BITS>& counts) + // { + // ASSERT(i <= TABLE_BITS); + // if constexpr (i >= TABLE_BITS) { + // // TODO use concepts or template metaprogramming to put this condition in method declaration + // return std::make_pair(0, 0); + // } else { + // if (counts[i] == BASE - 1) { + // counts[i] = 0; + // return update_counts<i + 1>(counts); + // } else { + // counts[i] += 1; + // } + + // uint64_t value = 0; + // uint64_t normalized_value = 0; + // uint64_t cumulative_base = 1; + // for (size_t j = 0; j < TABLE_BITS; ++j) { + // value += counts[j] * cumulative_base; + // normalized_value += (THETA_NORMALIZATION_TABLE[counts[j]]) * cumulative_base; + // cumulative_base *= BASE; + // } + // return std::make_pair(value, normalized_value); + // } + // } + + /** + * @brief Given a table input value, return the table output value + * + * Used by the Plookup code to precompute lookup tables and generate witness values + * + * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value) + * @return std::array<barretenberg::fr, 2> table output (normalized input and normalized input / 11^TABLE_BITS - 1) + */ + static std::array<barretenberg::fr, 2> get_theta_renormalization_values(const std::array<uint64_t, 2> key) + { + uint64_t accumulator = 0; + uint64_t input = key[0]; + uint64_t base_shift = 1; + while (input > 0) { + uint64_t slice = input % BASE; + uint64_t bit = THETA_NORMALIZATION_TABLE[static_cast<size_t>(slice)]; + accumulator += (bit * base_shift); + input /= BASE; + base_shift *= BASE; + } + return { barretenberg::fr(accumulator), barretenberg::fr(0) }; + } + + /** + * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1]) + * Code is slightly faster at runtime if we compute this at compile time + * + * @return constexpr std::array<uint64_t, TABLE_BITS> + */ + static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases() + { + std::array<uint64_t, TABLE_BITS> result; + uint64_t acc = 1; + for (size_t i = 0; i < TABLE_BITS; ++i) { + result[i] = acc; + acc *= BASE; + } + return result; + } + + /** + * @brief Get column values for next row of plookup table. Used to generate plookup table row values + * + * Input `counts` is an array of quasi-bits that represent the current row. + * Method increases `counts` by 1 and returns the plookup table column values. + * + * @param counts The current row value represented as an array of quasi-bits + * @return std::array<uint64_t, 2> the columns of the plookup table + */ + static std::array<uint64_t, 2> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts) + { + static constexpr auto scaled_bases = get_scaled_bases(); + + for (size_t i = 0; i < TABLE_BITS; ++i) { + if (counts[i] == BASE - 1) { + counts[i] = 0; + } else { + counts[i] += 1; + break; + } + } + + uint64_t value = 0; + uint64_t normalized_value = 0; + for (size_t i = 0; i < TABLE_BITS; ++i) { + value += counts[i] * scaled_bases[i]; + normalized_value += (THETA_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i]; + } + return { value, normalized_value }; + } + + /** + * @brief Generate plookup table that normalizes a TABLE_BITS-slice of a base-11 integer + * + * @param id + * @param table_index + * @return BasicTable + */ + static BasicTable generate_theta_renormalization_table(BasicTableId id, const size_t table_index) + { + // max_base_value_plus_one sometimes may not equal base iff this is an intermediate lookup table + // (e.g. keccak, we have base11 values that need to be normalized where the actual values-per-base only range + // from [0, 1, 2]) + BasicTable table; + table.id = id; + table.table_index = table_index; + table.use_twin_keys = false; + table.size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS); + + std::array<size_t, TABLE_BITS> counts{}; + std::array<uint64_t, 2> column_values{ 0, 0 }; + + for (size_t i = 0; i < table.size; ++i) { + table.column_1.emplace_back(column_values[0]); + table.column_2.emplace_back(column_values[1]); + table.column_3.emplace_back(0); + column_values = get_column_values_for_next_row(counts); + } + + table.get_values_from_key = &get_theta_renormalization_values; + + constexpr uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS); + table.column_1_step_size = barretenberg::fr(step_size); + table.column_2_step_size = barretenberg::fr(step_size); + table.column_3_step_size = barretenberg::fr(0); + return table; + } + + /** + * @brief Create the THETA MultiTable used by plookup to generate a sequence of lookups + * + * The THETA round operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits. + * + * i.e. multiple lookups are required for a single 64-bit integer. + * + * If we group these lookups together, we can derive the plookup column values + * from the relative difference between wire values. + * + * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the + * output slices + * + * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B, C where + * A = \sum_{i=0}^7 A_i * 11^8 + * B = \sum_{i=0}^7 B_i * 11^8 + * C_i = B_i / 11^8 (to get the most significant bit of B) + * + * Our plookup gates will produce a gates with the following wire values: + * + * | W1 | W2 | W3 | + * | ----------------------- | ----------------------- | --- | + * | \sum_{i=0}^7 A_i * 11^i | \sum_{i=0}^7 B_i * 11^i | --- | + * | \sum_{i=1}^7 A_i * 11^i | \sum_{i=1}^7 B_i * 11^i | --- | + * | \sum_{i=2}^7 A_i * 11^i | \sum_{i=2}^7 B_i * 11^i | --- | + * | ... | ... | ... | + * | A^7 | B^7 | --- | + * + * The plookup protocol extracts the 1st and 2nd lookup column values by taking: + * + * Colunn1 = W1[i] - 11^8 . W1[i + 1] + * Colunn2 = W2[i] - 11^8 . W2[i + 1] + * + * (where the -11^8 coefficient is stored in a precomputed selector polynomial) + * + * This MultiTable construction defines the value of these precomputed selector polynomial values, + * as well as defines how the column values are derived from a starting input value. + * + * @param id + * @return MultiTable + */ + static MultiTable get_theta_output_table(const MultiTableId id = KECCAK_THETA_OUTPUT) + { + constexpr size_t num_tables_per_multitable = + (64 / TABLE_BITS) + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 bits, 5 bits per entry + + uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS); + MultiTable table(column_multiplier, column_multiplier, 0, num_tables_per_multitable); + + table.id = id; + for (size_t i = 0; i < num_tables_per_multitable; ++i) { + table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS)); + table.lookup_ids.emplace_back(KECCAK_THETA); + table.get_table_values.emplace_back(&get_theta_renormalization_values); + } + return table; + } +}; +} // namespace keccak_tables +} // namespace plookup \ No newline at end of file diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp index 26cae5826a..299bc8966f 100644 --- a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp +++ b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp @@ -1,4 +1,5 @@ #include "plookup_tables.hpp" +#include <common/constexpr_utils.hpp> namespace plookup { @@ -82,6 +83,19 @@ void init_multi_tables() blake2s_tables::get_blake2s_xor_rotate_8_table(MultiTableId::BLAKE_XOR_ROTATE_8); MULTI_TABLES[MultiTableId::BLAKE_XOR_ROTATE_7] = blake2s_tables::get_blake2s_xor_rotate_7_table(MultiTableId::BLAKE_XOR_ROTATE_7); + MULTI_TABLES[MultiTableId::KECCAK_FORMAT_INPUT] = + keccak_tables::KeccakInput::get_keccak_input_table(MultiTableId::KECCAK_FORMAT_INPUT); + MULTI_TABLES[MultiTableId::KECCAK_THETA_OUTPUT] = + keccak_tables::Theta::get_theta_output_table(MultiTableId::KECCAK_THETA_OUTPUT); + MULTI_TABLES[MultiTableId::KECCAK_CHI_OUTPUT] = + keccak_tables::Chi::get_chi_output_table(MultiTableId::KECCAK_CHI_OUTPUT); + MULTI_TABLES[MultiTableId::KECCAK_FORMAT_OUTPUT] = + keccak_tables::KeccakOutput::get_keccak_output_table(MultiTableId::KECCAK_FORMAT_OUTPUT); + + barretenberg::constexpr_for<0, 25, 1>([&]<size_t i>() { + MULTI_TABLES[(size_t)MultiTableId::KECCAK_NORMALIZE_AND_ROTATE + i] = + keccak_tables::Rho<8, i>::get_rho_output_table(MultiTableId::KECCAK_NORMALIZE_AND_ROTATE); + }); } } // namespace diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp index 518e025982..e1e30bcb44 100644 --- a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp +++ b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp @@ -9,6 +9,12 @@ #include "uint.hpp" #include "non_native_group_generator.hpp" #include "blake2s.hpp" +#include "keccak/keccak_chi.hpp" +#include "keccak/keccak_input.hpp" +#include "keccak/keccak_output.hpp" +#include "keccak/keccak_rho.hpp" +#include "keccak/keccak_theta.hpp" + namespace plookup { const MultiTable& create_table(const MultiTableId id); @@ -236,6 +242,42 @@ inline BasicTable create_basic_table(const BasicTableId id, const size_t index) case PEDERSEN_IV_BASE: { return pedersen_tables::basic::generate_pedersen_iv_table(PEDERSEN_IV_BASE); } + case KECCAK_INPUT: { + return keccak_tables::KeccakInput::generate_keccak_input_table(KECCAK_INPUT, index); + } + case KECCAK_THETA: { + return keccak_tables::Theta::generate_theta_renormalization_table(KECCAK_THETA, index); + } + case KECCAK_CHI: { + return keccak_tables::Chi::generate_chi_renormalization_table(KECCAK_CHI, index); + } + case KECCAK_OUTPUT: { + return keccak_tables::KeccakOutput::generate_keccak_output_table(KECCAK_OUTPUT, index); + } + case KECCAK_RHO_1: { + return keccak_tables::Rho<1>::generate_rho_renormalization_table(KECCAK_RHO_1, index); + } + case KECCAK_RHO_2: { + return keccak_tables::Rho<2>::generate_rho_renormalization_table(KECCAK_RHO_2, index); + } + case KECCAK_RHO_3: { + return keccak_tables::Rho<3>::generate_rho_renormalization_table(KECCAK_RHO_3, index); + } + case KECCAK_RHO_4: { + return keccak_tables::Rho<4>::generate_rho_renormalization_table(KECCAK_RHO_4, index); + } + case KECCAK_RHO_5: { + return keccak_tables::Rho<5>::generate_rho_renormalization_table(KECCAK_RHO_5, index); + } + case KECCAK_RHO_6: { + return keccak_tables::Rho<6>::generate_rho_renormalization_table(KECCAK_RHO_6, index); + } + case KECCAK_RHO_7: { + return keccak_tables::Rho<7>::generate_rho_renormalization_table(KECCAK_RHO_7, index); + } + case KECCAK_RHO_8: { + return keccak_tables::Rho<8>::generate_rho_renormalization_table(KECCAK_RHO_8, index); + } default: { throw_or_abort("table id does not exist"); return sparse_tables::generate_sparse_table_with_rotation<9, 8, 0>(AES_SPARSE_MAP, index); diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp index c440459848..142b948d6b 100644 --- a/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp +++ b/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp @@ -83,6 +83,20 @@ enum BasicTableId { PEDERSEN_1, PEDERSEN_0, PEDERSEN_IV_BASE, + KECCAK_INPUT, + KECCAK_THETA, + KECCAK_RHO, + KECCAK_CHI, + KECCAK_OUTPUT, + KECCAK_RHO_1, + KECCAK_RHO_2, + KECCAK_RHO_3, + KECCAK_RHO_4, + KECCAK_RHO_5, + KECCAK_RHO_6, + KECCAK_RHO_7, + KECCAK_RHO_8, + KECCAK_RHO_9, }; enum MultiTableId { @@ -122,7 +136,12 @@ enum MultiTableId { BLAKE_XOR_ROTATE_8, BLAKE_XOR_ROTATE_7, PEDERSEN_IV, - NUM_MULTI_TABLES, + KECCAK_THETA_OUTPUT, + KECCAK_CHI_OUTPUT, + KECCAK_FORMAT_INPUT, + KECCAK_FORMAT_OUTPUT, + KECCAK_NORMALIZE_AND_ROTATE, + NUM_MULTI_TABLES = KECCAK_NORMALIZE_AND_ROTATE + 25, }; struct MultiTable { diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.cpp b/cpp/src/aztec/plonk/composer/ultra_composer.cpp index 6de7357e35..5cb997b1ef 100644 --- a/cpp/src/aztec/plonk/composer/ultra_composer.cpp +++ b/cpp/src/aztec/plonk/composer/ultra_composer.cpp @@ -1102,6 +1102,20 @@ std::vector<uint32_t> UltraComposer::decompose_into_default_range(const uint32_t } const uint64_t sublimb_mask = (1ULL << target_range_bitnum) - 1; + + /** + * TODO: Support this commented-out code! + * At the moment, `decompose_into_default_range` generates a minimum of 1 arithmetic gate. + * This is not strictly required iff num_bits <= target_range_bitnum. + * However, this produces an edge-case where a variable is range-constrained but NOT present in an arithmetic gate. + * This in turn produces an unsatisfiable circuit (see `create_new_range_constraint`). We would need to check for + * and accomodate/reject this edge case to support not adding addition gates here if not reqiured + * if (num_bits <= target_range_bitnum) { + * const uint64_t expected_range = (1ULL << num_bits) - 1ULL; + * create_new_range_constraint(variable_index, expected_range); + * return { variable_index }; + * } + **/ std::vector<uint64_t> sublimbs; std::vector<uint32_t> sublimb_indices; diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.hpp b/cpp/src/aztec/plonk/composer/ultra_composer.hpp index 74ce23509c..c5880d04db 100644 --- a/cpp/src/aztec/plonk/composer/ultra_composer.hpp +++ b/cpp/src/aztec/plonk/composer/ultra_composer.hpp @@ -150,6 +150,15 @@ class UltraComposer : public ComposerBase { void create_range_constraint(const uint32_t variable_index, const size_t num_bits, std::string const&) { if (num_bits <= DEFAULT_PLOOKUP_RANGE_BITNUM) { + /** + * N.B. if `variable_index` is not used in any arithmetic constraints, this will create an unsatisfiable + * circuit! + * this range constraint will increase the size of the 'sorted set' of range-constrained integers by 1. + * The 'non-sorted set' of range-constrained integers is a subset of the wire indices of all arithmetic + * gates. No arithemtic gate => size imbalance between sorted and non-sorted sets. Checking for this + * and throwing an error would require a refactor of the Composer to catelog all 'orphan' variables not + * assigned to gates. + **/ create_new_range_constraint(variable_index, 1ULL << num_bits); } else { decompose_into_default_range(variable_index, num_bits); @@ -178,6 +187,10 @@ class UltraComposer : public ComposerBase { */ virtual size_t get_num_gates() const override { + // if circuit finalised already added extra gates + if (circuit_finalised) { + return num_gates; + } size_t count = num_gates; size_t rangecount = 0; size_t romcount = 0; @@ -208,7 +221,13 @@ class UltraComposer : public ComposerBase { { size_t count = num_gates; size_t rangecount = 0; + size_t constant_rangecount = 0; size_t romcount = 0; + size_t plookupcount = 0; + for (auto& table : lookup_tables) { + plookupcount += table.lookup_gates.size(); + count -= table.lookup_gates.size(); + } for (size_t i = 0; i < rom_arrays.size(); ++i) { for (size_t j = 0; j < rom_arrays[i].state.size(); ++j) { if (rom_arrays[i].state[j][0] == UNINITIALIZED_MEMORY_RECORD) { @@ -228,9 +247,15 @@ class UltraComposer : public ComposerBase { list_size += padding; rangecount += (list_size / gate_width); rangecount += 1; // we need to add 1 extra addition gates for every distinct range list + + // rough estimate + const size_t constant_cost = static_cast<size_t>(list.second.target_range / 6); + constant_rangecount += constant_cost; + rangecount -= constant_cost; } - size_t total = count + romcount + rangecount; - std::cout << "gates = " << total << " (arith " << count << ", rom " << romcount << ", range " << rangecount + size_t total = count + romcount + rangecount + constant_rangecount + plookupcount; + std::cout << "gates = " << total << " (arith " << count << ", plookup " << plookupcount << ", rom " << romcount + << ", range " << rangecount << ", range table init cost = " << constant_rangecount << "), pubinp = " << public_inputs.size() << std::endl; } diff --git a/cpp/src/aztec/stdlib/hash/CMakeLists.txt b/cpp/src/aztec/stdlib/hash/CMakeLists.txt index d8b6078d10..c8aaf1e4b4 100644 --- a/cpp/src/aztec/stdlib/hash/CMakeLists.txt +++ b/cpp/src/aztec/stdlib/hash/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(blake2s) add_subdirectory(blake3s) add_subdirectory(pedersen) -add_subdirectory(sha256) \ No newline at end of file +add_subdirectory(sha256) +add_subdirectory(keccak) \ No newline at end of file diff --git a/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt b/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt new file mode 100644 index 0000000000..d15f3dc401 --- /dev/null +++ b/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt @@ -0,0 +1 @@ +barretenberg_module(stdlib_keccak stdlib_primitives crypto_keccak) \ No newline at end of file diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp new file mode 100644 index 0000000000..2a138a5a11 --- /dev/null +++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp @@ -0,0 +1,620 @@ +#include "keccak.hpp" +#include <plonk/composer/ultra_composer.hpp> +#include <stdlib/primitives/uint/uint.hpp> +#include <common/constexpr_utils.hpp> +#include <numeric/bitop/sparse_form.hpp> +namespace plonk { +namespace stdlib { + +/** + * @brief Normalize a base-11 limb and left-rotate by keccak::ROTATIONS[lane_index] bits. + * This method also extracts the most significant bit of the normalised rotated limb. + * Used in the RHO and IOTA rounds and in `sponge_absorb`. + * + * Normalize process: + * Input v = \sum_{i=0}^63 b_i * 11^i , where b is in range [0, 1, 2] + * Output = \sum_{i=0}^63 (b_i & 1) * 11^i (i.e. even values go to 0) + * + * Implementation is via a sequence of lookup tables + * + * @tparam lane_index What keccak lane are we working on? + * @param limb Input limb we want to normalize and rotate + * @param msb (return parameter) The most significant bit of the normalized and rotated limb + * @return field_t<Composer> The normalized and rotated output + */ +template <typename Composer> +template <size_t lane_index> +field_t<Composer> keccak<Composer>::normalize_and_rotate(const field_ct& limb, field_ct& msb) +{ + // left_bits = the number of bits that wrap around 11^64 (left_bits) + constexpr size_t left_bits = ROTATIONS[lane_index]; + + // right_bits = the number of bits that don't wrap + constexpr size_t right_bits = 64 - ROTATIONS[lane_index]; + + // TODO read from same source as plookup table code + constexpr size_t max_bits_per_table = plookup::keccak_tables::Rho<>::MAXIMUM_MULTITABLE_BITS; + + // compute the number of lookups required for our left and right bit slices + constexpr size_t num_left_tables = left_bits / max_bits_per_table + (left_bits % max_bits_per_table > 0 ? 1 : 0); + constexpr size_t num_right_tables = right_bits / max_bits_per_table + (right_bits % max_bits_per_table > 0 ? 1 : 0); + + // get the numerical value of the left and right bit slices + // (lookup table input values derived from left / right) + uint256_t input = limb.get_value(); + constexpr uint256_t slice_divisor = BASE.pow(right_bits); + const auto [left, right] = input.divmod(slice_divisor); + + // compute the normalized values for the left and right bit slices + // (lookup table output values derived from left_normalised / right_normalized) + uint256_t left_normalized = normalize_sparse(left); + uint256_t right_normalized = normalize_sparse(right); + + /** + * manually construct the ReadData object required to generate plookup gate constraints. + * To explain in more detail: the input integer can be represented via two the bit slices [A, B] + * (A = left, B = right) + * + * For example, imagine our input is a 32-bit integer A represented as: A = A3.11^24 + A2.11^16 + A1.11^8 + A0, + * and our output is a 32-bit integer B = B3.11^24 + B2.11^16 + B1.11^8 + B0 + * + * In this example, we want to normalize A and left-rotate by 16 bits. + * + * Our lookup gate wire values will look like the following: + * + * | Row | C0 | C1 | C2 | + * | --- | -----------------------------------------| ------------ | -------- | + * | 0 | A3.11^24 + A2.11^16 + A1.11^8 + A0 | B1.11^8 + B0 | A0.msb() | + * | 1 | A3.11^16 + A2.11^8 + A1 | B1 | A1.msb() | + * | 2 | A1311^8 + A2 | B3.11^8 + B2 | A2.msb() | + * | 3 | A3 | B3 | A3.msb() | + * + * The plookup table keys + values are derived via the expression: + * + * C1[i] + C1[i+1].q1[i] = LOOKUP[C0[i] + C0[i+1].q0[i]] + * + * (the same applies for C2, however q2[i] = 0 for all rows) + * + * The plookup coefficients for the rows treat Column0 as a single accumulating sum, + * but Column1 is a pair of accumulating sums. + * In the above example, the q coefficient value are: + * + * | Row | Q1 | Q2 | Q3 | + * | --- | ---- | ---- | -- | + * | 0 | 11^8 | 11^8 | 0 | + * | 1 | 11^8 | 0 | 0 | + * | 2 | 11^8 | 11^8 | 0 | + * | 3 | 0 | 0 | 0 | + * + * stdlib::plookup cannot derive witnesses in the above pattern without a substantial rewrite, + * so we do it manually in this method! + **/ + plookup::ReadData<barretenberg::fr> lookup; + + // compute plookup witness values for a given slice + // (same lambda can be used to compute witnesses for left and right slices) + auto compute_lookup_witnesses_for_limb = [&]<size_t limb_bits, size_t num_lookups>(uint256_t& normalized) { + // (use a constexpr loop to make some pow and div operations compile-time) + barretenberg::constexpr_for<0, num_lookups, 1>([&]<size_t i> { + constexpr size_t num_bits_processed = i * max_bits_per_table; + + // How many bits can this slice contain? + // We want to implicitly range-constrain `normalized < 11^{limb_bits}`, + // which means potentially using a lookup table that is not of size 11^{max_bits_per_table} + // for the most-significant slice + constexpr size_t bit_slice = (num_bits_processed + max_bits_per_table > limb_bits) + ? limb_bits % max_bits_per_table + : max_bits_per_table; + + // current column values are tracked via 'input' and 'normalized' + lookup[ColumnIdx::C1].push_back(input); + lookup[ColumnIdx::C2].push_back(normalized); + + constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), bit_slice); + constexpr uint64_t msb_divisor = divisor / static_cast<uint64_t>(BASE); + + // compute the value of the most significant bit of this slice and store in C3 + const auto [normalized_quotient, normalized_slice] = normalized.divmod(divisor); + + // 256-bit divisions are expensive! cast to u64s when we don't need the extra bits + const uint64_t normalized_msb = (static_cast<uint64_t>(normalized_slice) / msb_divisor); + lookup[ColumnIdx::C3].push_back(normalized_msb); + + // We need to provide a key/value object for this lookup in order for the Composer + // to compute the plookup sorted list commitment + const auto [input_quotient, input_slice] = input.divmod(divisor); + lookup.key_entries.push_back( + { { static_cast<uint64_t>(input_slice), 0 }, { normalized_slice, normalized_msb } }); + + // reduce the input and output by 11^{bit_slice} + input = input_quotient; + normalized = normalized_quotient; + }); + }; + + // template lambda syntax is a little funky. + // Need to explicitly write `.template operator()` (instead of just `()`). + // Otherwise compiler cannot distinguish between `>` symbol referring to closing the template parameter list, + // OR `>` being a greater-than operator :/ + compute_lookup_witnesses_for_limb.template operator()<right_bits, num_right_tables>(right_normalized); + compute_lookup_witnesses_for_limb.template operator()<left_bits, num_left_tables>(left_normalized); + + // Call composer method to create plookup constraints. + // The MultiTable table index can be derived from `lane_idx` + // Each lane_idx has a different rotation amount, which changes sizes of left/right slices + // and therefore the selector constants required (i.e. the Q1, Q2, Q3 values in the earlier example) + const auto accumulator_witnesses = limb.context->create_gates_from_plookup_accumulators( + (plookup::MultiTableId)((size_t)KECCAK_NORMALIZE_AND_ROTATE + lane_index), + lookup, + limb.normalize().get_witness_index()); + + // extract the most significant bit of the normalized output from the final lookup entry in column C3 + msb = field_ct::from_witness_index(limb.get_context(), + accumulator_witnesses[ColumnIdx::C3][num_left_tables + num_right_tables - 1]); + + // Extract the witness that maps to the normalized right slice + const field_t<Composer> right_output = + field_t<Composer>::from_witness_index(limb.get_context(), accumulator_witnesses[ColumnIdx::C2][0]); + + if (num_left_tables == 0) { + // if the left slice size is 0 bits (i.e. no rotation), return `right_output` + return right_output; + } else { + // Extract the normalized left slice + const field_t<Composer> left_output = field_t<Composer>::from_witness_index( + limb.get_context(), accumulator_witnesses[ColumnIdx::C2][num_right_tables]); + + // Stitch the right/left slices together to create our rotated output + constexpr uint256_t shift = BASE.pow(ROTATIONS[lane_index]); + return (left_output + right_output * shift); + } +} + +/** + * @brief Compute twisted representation of hash lane + * + * The THETA round requires computation of XOR(A, ROTL(B, 1)) + * + * We do this via a 'twisted' base-11 representation. + * + * If the bit slices for a regular variable are arranged [b63, ..., b0], + * the twisted representation is a 65-bit variable [b63, ..., b0, b63] + * + * The equivalent of XOR(A, ROTL(B, 1)) is A.twist + 2B.twist (in base-11 form) + * The output is present in bit slices 1-64 + * + * @tparam Composer + * @param internal + */ +template <typename Composer> void keccak<Composer>::compute_twisted_state(keccak_state& internal) +{ + for (size_t i = 0; i < 25; ++i) { + internal.twisted_state[i] = ((internal.state[i] * 11) + internal.state_msb[i]).normalize(); + } +} + +/** + * @brief THETA round + * + * @tparam Composer + * + * THETA consists of XOR operations as well as left rotations by 1 bit. + * + * We represent 64-bit integers in a base-11 representation where + * limb = \sum_{i=0}^63 b_i * 11^i + * + * At the start of THETA, all b_i values are either 0 or 1 + * + * We can efficiently evaluate XOR operations via simple additions! + * If b_i = even, this represents a bit value of 0 + * If b_i = odd, this represents a bit value of 1 + * + * The KECCAK_THETA_OUTPUT lookup table is used to 'normalize' base-11 integers, + * i.e. convert b_i values from [0, ..., 10] to [0, 1] where even == 0, odd == 1 + * + * The choice of base for our representation effects the following: + * 1. the number of normalization lookups required to avoid overflowing the base + * 2. the cost of normalization lookups + * + * Bigger base reduces (1) but increases (2). For THETA, base-11 is optimal (I think...) + * + * ### HANDLING ROTATIONS + * + * We need to left-rotate the C[5] array by 1-bit to compute D[5]. Naive way is expensive so we cheat! + * When converting integers into base-11 representation, we use a lookup table column to give us the + * most significant bit of the integer. + * + * This enables us to create a 'twisted' representation of the integer in base-11: + * + * twisted_limb = (b_63) + \sum_{i=0}^63 b_i * 11^{i + 1} + * + * e.g. if limb's bit ordering is [0, b63, ..., b1, b0 ] + * twisted limb bit ordering is [b63, b62, ..., b0, b63] + * + * We want to be able to compute XOR(A, B.rotate_left(1)) and can do this via twisted representations + * + * The equivalent in base-11 world is twisted_A * 2 + twisted_B. + * The output of the XOR operation exists in bit-slices 1, ..., 63 + * (which can be extracted by removing the least and most significant slices of the output) + * This is MUCH cheaper than the extra range constraints required for a naive left-rotation + * + * Total cost of theta = 20.5 gates per 5 lanes + 25 = 127.5 per round + */ +template <typename Composer> void keccak<Composer>::theta(keccak_state& internal) +{ + std::array<field_ct, 5> C; + std::array<field_ct, 5> D; + + auto& state = internal.state; + const auto& twisted_state = internal.twisted_state; + for (size_t i = 0; i < 5; ++i) { + + /** + * field_ct::accumulate can compute 5 addition operations in only 2 gates: + * Gate 0 wires [a0, a1, a2, a3] + * Gate 1 wires [b0, b1, b2, b3] + * b3 = a0 + a1 + a2 + a3 + * b2 = b3 + b0 + b1 + * (b2 is the output wire) + **/ + C[i] = field_ct::accumulate({ twisted_state[i], + twisted_state[5 + i], + twisted_state[10 + i], + twisted_state[15 + i], + twisted_state[20 + i] }); + } + + /** + * Compute D by exploiting twisted representation + * to get a cheap left-rotation by 1 bit + */ + for (size_t i = 0; i < 5; ++i) { + const auto non_shifted_equivalent = (C[(i + 4) % 5]); + const auto shifted_equivalent = C[(i + 1) % 5] * BASE; + D[i] = (non_shifted_equivalent + shifted_equivalent); + } + + /** + * D contains 66 base-11 slices. + * + * We need to remove the 2 most significant slices as they + * are artifacts of our twist operation. + * + * We also need to 'normalize' D (i.e. convert each base value to be 0 or 1), + * to prevent our base from overflowing when we XOR D into internal.state + * + * 1. create sliced_D witness, plus lo and hi slices + * 2. validate D == lo + (sliced_D * 11) + (hi * 11^65) + * 3. feed sliced_D into KECCAK_THETA_OUTPUT lookup table + * + * KECCAK_THETA_OUTPUT currently splices its input into 16 4-bit slices (in base 11 i.e. from 0 to 11^4 - 1) + * This ensures that sliced_D is correctly range constrained to be < 11^64 + */ + static constexpr uint256_t divisor = BASE.pow(64); + static constexpr uint256_t multiplicand = BASE.pow(65); + for (size_t i = 0; i < 5; ++i) { + uint256_t D_native = D[i].get_value(); + const auto [D_quotient, lo_native] = D_native.divmod(BASE); + const uint256_t hi_native = D_quotient / divisor; + const uint256_t mid_native = D_quotient - hi_native * divisor; + + field_ct hi(witness_ct(internal.context, hi_native)); + field_ct mid(witness_ct(internal.context, mid_native)); + field_ct lo(witness_ct(internal.context, lo_native)); + + // assert equal should cost 1 gate (multipliers are all constants) + D[i].assert_equal((hi * multiplicand).add_two(mid * 11, lo)); + internal.context->create_new_range_constraint(hi.get_witness_index(), static_cast<uint64_t>(BASE)); + internal.context->create_new_range_constraint(lo.get_witness_index(), static_cast<uint64_t>(BASE)); + + // If number of bits in KECCAK_THETA_OUTPUT table does NOT cleanly divide 64, + // we need an additional range constraint to ensure that mid < 11^64 + if constexpr (64 % plookup::keccak_tables::Theta::TABLE_BITS == 0) { + // N.B. we could optimize out 5 gates per round here but it's very fiddly... + // In previous section, D[i] = X + Y (non shifted equiv and shifted equiv) + // We also want to validate D[i] == hi' + mid' + lo (where hi', mid' are hi, mid scaled by constants) + // We *could* create a big addition gate to validate the previous logic w. following structure: + // | w1 | w2 | w3 | w4 | + // | -- | --- | -- | -- | + // | hi | mid | lo | X | + // | P0 | P1 | P2 | Y | + // To save a gate, we would need to place the wires for the first KECCAK_THETA_OUTPUT plookup gate + // at P0, P1, P2. This is fiddly composer logic that is circuit-width-dependent + // (this would save 120 gates per hash block... not worth making the code less readable for that) + D[i] = plookup_read::read_from_1_to_2_table(KECCAK_THETA_OUTPUT, mid); + } else { + const auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_THETA_OUTPUT, D[i]); + D[i] = accumulators[ColumnIdx::C2][0]; + + // Ensure input to lookup is < 11^64, + // by validating most significant input slice is < 11^{64 mod slice_bits} + const field_ct most_significant_slice = accumulators[ColumnIdx::C1][accumulators[ColumnIdx::C1].size() - 1]; + + // N.B. cheaper to validate (11^{64 mod slice_bits} - slice < 2^14) as this + // prevents an extra range table from being created + constexpr uint256_t maximum = BASE.pow(64 % plookup::keccak_tables::Theta::TABLE_BITS); + const field_ct target = -most_significant_slice + maximum; + ASSERT(((uint256_t(1) << Composer::DEFAULT_PLOOKUP_RANGE_BITNUM) - 1) > maximum); + target.create_range_constraint(Composer::DEFAULT_PLOOKUP_RANGE_BITNUM, + "input to KECCAK_THETA_OUTPUT too large!"); + } + } + + // compute state[j * 5 + i] XOR D[i] in base-11 representation + for (size_t i = 0; i < 5; ++i) { + for (size_t j = 0; j < 5; ++j) { + state[j * 5 + i] = state[j * 5 + i] + D[i]; + } + } +} + +/** + * @brief RHO round + * + * @tparam Composer + * + * The limbs of internal.state are represented via base-11 integers + * limb = \sum_{i=0}^63 b_i * 11^i + * The value of each b_i can be in the range [0, 1, 2] due to the THETA round XOR operations + * + * We need to do the following: + * + * 1. 'normalize' each limb so that each b_i value is 0 or 1 + * 2. left-rotate each limb as defined by the keccak `rotations` matrix + * + * The KECCAK_RHO_OUTPUT lookup table is used for both. See `normalize_and_rotate` for more details + * + * COST PER LIMB... + * 8 gates for first lane (no rotation. Lookup table is 8-bits per slice = 8 lookups for 64 bits) + * 10 gates for other 24 lanes (lookup sequence is split into 6 8-bit slices and 2 slices that sum to 8 bits, + * an addition gate is required to complete the rotation) + * + * Total costs is 248 gates. + * + * N.B. Can reduce lookup costs by using larger lookup tables. + * Current algo is optimized for lookup tables where sum of all table sizes is < 2^64 + */ +template <typename Composer> void keccak<Composer>::rho(keccak_state& internal) +{ + constexpr_for<0, 25, 1>( + [&]<size_t i>() { internal.state[i] = normalize_and_rotate<i>(internal.state[i], internal.state_msb[i]); }); +} + +/** + * @brief PI + * + * PI permutes the keccak lanes. Adds 0 constraints as this is simply a + * re-ordering of witnesses + * + * @tparam Composer + * @param internal + */ +template <typename Composer> void keccak<Composer>::pi(keccak_state& internal) +{ + std::array<field_ct, 25> B; + + for (size_t j = 0; j < 5; ++j) { + for (size_t i = 0; i < 5; ++i) { + B[j * 5 + i] = internal.state[j * 5 + i]; + } + } + + for (size_t y = 0; y < 5; ++y) { + for (size_t x = 0; x < 5; ++x) { + size_t u = (0 * x + 1 * y) % 5; + size_t v = (2 * x + 3 * y) % 5; + + internal.state[v * 5 + u] = B[5 * y + x]; + } + } +} + +/** + * @brief CHI + * + * The CHI round applies the following logic to the hash lanes: + * A XOR (~B AND C) + * + * In base-11 representation we can create an equivalent linear operation: + * 1 + 2A - B + C + * + * Output values will range from [0, 1, 2, 3, 4] and are mapped back into [0, 1] + * via the KECCAK_CHI_OUTPUT lookup table + * + * N.B. the KECCAK_CHI_OUTPUT table also has a column for the most significant bit of each lookup. + * We use this to create a 'twisted representation of each hash lane (see THETA comments for more details) + * @tparam Composer + */ +template <typename Composer> void keccak<Composer>::chi(keccak_state& internal) +{ + // (cost = 12 * 25 = 300?) + auto& state = internal.state; + + for (size_t y = 0; y < 5; ++y) { + std::array<field_ct, 5> lane_outputs; + for (size_t x = 0; x < 5; ++x) { + const auto A = state[y * 5 + x]; + const auto B = state[y * 5 + ((x + 1) % 5)]; + const auto C = state[y * 5 + ((x + 2) % 5)]; + + // vv should cost 1 gate + lane_outputs[x] = (A + A + CHI_OFFSET).add_two(-B, C); + } + for (size_t x = 0; x < 5; ++x) { + // Normalize lane outputs and assign to internal.state + auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_CHI_OUTPUT, lane_outputs[x]); + internal.state[y * 5 + x] = accumulators[ColumnIdx::C2][0]; + internal.state_msb[y * 5 + x] = accumulators[ColumnIdx::C3][accumulators[ColumnIdx::C3].size() - 1]; + } + } +} + +/** + * @brief IOTA + * + * XOR first hash limb with a precomputed constant. + * We re-use the RHO_OUTPUT table to normalize after this operation + * @tparam Composer + * @param internal + * @param round + */ +template <typename Composer> void keccak<Composer>::iota(keccak_state& internal, size_t round) +{ + const field_ct xor_result = internal.state[0] + SPARSE_RC[round]; + + // normalize lane value so that we don't overflow our base11 modulus boundary in the next round + internal.state[0] = normalize_and_rotate<0>(xor_result, internal.state_msb[0]); + + // No need to add constraints to compute twisted repr if this is the last round + if (round != 23) { + compute_twisted_state(internal); + } +} + +template <typename Composer> void keccak<Composer>::keccakf1600(keccak_state& internal) +{ + for (size_t i = 0; i < 24; ++i) { + theta(internal); + rho(internal); + pi(internal); + chi(internal); + iota(internal, i); + } +} + +template <typename Composer> +void keccak<Composer>::sponge_absorb(keccak_state& internal, + const std::vector<field_ct>& input_buffer, + const std::vector<field_ct>& msb_buffer) +{ + const size_t l = input_buffer.size(); + + const size_t num_blocks = l / (BLOCK_SIZE / 8); + + for (size_t i = 0; i < num_blocks; ++i) { + if (i == 0) { + for (size_t j = 0; j < LIMBS_PER_BLOCK; ++j) { + internal.state[j] = input_buffer[j]; + internal.state_msb[j] = msb_buffer[j]; + } + for (size_t j = LIMBS_PER_BLOCK; j < 25; ++j) { + internal.state[j] = witness_ct::create_constant_witness(internal.context, 0); + internal.state_msb[j] = witness_ct::create_constant_witness(internal.context, 0); + } + } else { + for (size_t j = 0; j < LIMBS_PER_BLOCK; ++j) { + internal.state[j] += input_buffer[i * LIMBS_PER_BLOCK + j]; + + internal.state[j] = normalize_and_rotate<0>(internal.state[j], internal.state_msb[j]); + } + } + + compute_twisted_state(internal); + keccakf1600(internal); + } +} + +template <typename Composer> byte_array<Composer> keccak<Composer>::sponge_squeeze(keccak_state& internal) +{ + byte_array_ct result(internal.context); + + // Each hash limb represents a little-endian integer. Need to reverse bytes before we write into the output array + for (size_t i = 0; i < 4; ++i) { + field_ct output_limb = plookup_read::read_from_1_to_2_table(KECCAK_FORMAT_OUTPUT, internal.state[i]); + byte_array_ct limb_bytes(output_limb, 8); + byte_array_ct little_endian_limb_bytes(internal.context, 8); + little_endian_limb_bytes.set_byte(0, limb_bytes[7]); + little_endian_limb_bytes.set_byte(1, limb_bytes[6]); + little_endian_limb_bytes.set_byte(2, limb_bytes[5]); + little_endian_limb_bytes.set_byte(3, limb_bytes[4]); + little_endian_limb_bytes.set_byte(4, limb_bytes[3]); + little_endian_limb_bytes.set_byte(5, limb_bytes[2]); + little_endian_limb_bytes.set_byte(6, limb_bytes[1]); + little_endian_limb_bytes.set_byte(7, limb_bytes[0]); + result.write(little_endian_limb_bytes); + } + return result; +} + +template <typename Composer> stdlib::byte_array<Composer> keccak<Composer>::hash(byte_array_ct& input) +{ + auto ctx = input.get_context(); + + if (ctx == nullptr) { + // if buffer is constant compute hash and return w/o creating constraints + byte_array_ct output(nullptr, 32); + const std::vector<uint8_t> result = hash_native(input.get_value()); + for (size_t i = 0; i < 32; ++i) { + output.set_byte(i, result[i]); + } + return output; + } + + const size_t input_size = input.size(); + + // copy input into buffer and pad + const size_t blocks = input_size / BLOCK_SIZE; + const size_t blocks_length = (BLOCK_SIZE * (blocks + 1)); + + byte_array_ct block_bytes(input); + + const size_t byte_difference = blocks_length - input_size; + byte_array_ct padding_bytes(ctx, byte_difference); + for (size_t i = 0; i < byte_difference; ++i) { + padding_bytes.set_byte(i, witness_ct::create_constant_witness(ctx, 0)); + } + + block_bytes.write(padding_bytes); + block_bytes.set_byte(input_size, witness_ct::create_constant_witness(ctx, 0x1)); + block_bytes.set_byte(block_bytes.size() - 1, witness_ct::create_constant_witness(ctx, 0x80)); + + // keccak lanes interpret memory as little-endian integers, + // means we need to swap our byte ordering... + for (size_t i = 0; i < block_bytes.size(); i += 8) { + std::array<field_ct, 8> temp; + for (size_t j = 0; j < 8; ++j) { + temp[j] = block_bytes[i + j]; + } + block_bytes.set_byte(i, temp[7]); + block_bytes.set_byte(i + 1, temp[6]); + block_bytes.set_byte(i + 2, temp[5]); + block_bytes.set_byte(i + 3, temp[4]); + block_bytes.set_byte(i + 4, temp[3]); + block_bytes.set_byte(i + 5, temp[2]); + block_bytes.set_byte(i + 6, temp[1]); + block_bytes.set_byte(i + 7, temp[0]); + } + const size_t byte_size = block_bytes.size(); + keccak_state internal; + internal.context = ctx; + + const size_t num_limbs = byte_size / WORD_SIZE; + std::vector<field_ct> converted_buffer(num_limbs); + std::vector<field_ct> msb_buffer(num_limbs); + + for (size_t i = 0; i < num_limbs; ++i) { + field_ct sliced; + if (i * WORD_SIZE + WORD_SIZE > byte_size) { + const size_t slice_size = byte_size - (i * WORD_SIZE); + const size_t byte_shift = (WORD_SIZE - slice_size) * 8; + sliced = field_ct(block_bytes.slice(i * WORD_SIZE, slice_size)); + sliced = (sliced * (uint256_t(1) << byte_shift)).normalize(); + } else { + sliced = field_ct(block_bytes.slice(i * WORD_SIZE, WORD_SIZE)); + } + const auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_FORMAT_INPUT, sliced); + converted_buffer[i] = accumulators[ColumnIdx::C2][0]; + msb_buffer[i] = accumulators[ColumnIdx::C3][accumulators[ColumnIdx::C3].size() - 1]; + } + + sponge_absorb(internal, converted_buffer, msb_buffer); + + auto result = sponge_squeeze(internal); + + return result; +} + +template class keccak<plonk::UltraComposer>; + +} // namespace stdlib +} // namespace plonk \ No newline at end of file diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp new file mode 100644 index 0000000000..c71612392d --- /dev/null +++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp @@ -0,0 +1,189 @@ +#pragma once +#include <array> +#include <stdlib/primitives/uint/uint.hpp> +#include <stdlib/primitives/packed_byte_array/packed_byte_array.hpp> +#include <stdlib/primitives/byte_array/byte_array.hpp> + +namespace plonk { +class UltraComposer; +} // namespace plonk + +namespace plonk { +namespace stdlib { +template <typename Composer> class bit_array; + +/** + * @brief KECCAAAAAAAAAAK + * + * Creates constraints that evaluate the Keccak256 hash algorithm. + * + * UltraPlonk only due to heavy lookup table use. + * + * Current cost 17,329 constraints for a 1-block hash + * using small(ish) lookup tables (total size < 2^64) + * + * @tparam Composer + */ +template <typename Composer> class keccak { + public: + using witness_ct = stdlib::witness_t<Composer>; + using field_ct = stdlib::field_t<Composer>; + using byte_array_ct = stdlib::byte_array<Composer>; + + // base of extended representation we use for efficient logic operations + static constexpr uint256_t BASE = 11; + + // number of bits of hash output + static constexpr size_t BITS = 256; + + // word size of hash lane + static constexpr size_t WORD_SIZE = 8; + + // block size. We only support keccak256 with a 1088-bit rate! This is what Ethereum uses + static constexpr size_t BLOCK_SIZE = (1600 - BITS * 2) / WORD_SIZE; + + // how many limbs fit into a block (17) + static constexpr size_t LIMBS_PER_BLOCK = BLOCK_SIZE / 8; + + // round constants. Used in IOTA round + static constexpr std::array<uint64_t, 24> RC = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000, 0x000000000000808b, + 0x0000000080000001, 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000a, 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a, + 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 + }; + + // Rotation offsets, y vertically, x horizontally: r[y * 5 + x] + static constexpr std::array<size_t, 25> ROTATIONS = { + 0, 1, 62, 28, 27, 36, 44, 6, 55, 20, 3, 10, 43, 25, 39, 41, 45, 15, 21, 8, 18, 2, 61, 56, 14, + }; + + /** + * @brief Convert a binary integer into a base11 integer + * + * Input = \sum_{i=0}^63 b_i * 2^i + * Output = \sum_{i=0}^63 b_i * 11^i + * + * @param input + * @return constexpr uint256_t sparse form of input + */ + static constexpr uint256_t convert_to_sparse(uint256_t input) + { + std::array<uint64_t, 64> out_bits; + size_t count = 0; + while (input > 0) { + uint64_t bit = static_cast<uint64_t>(input & 1); + out_bits[count++] = bit; + input = input >> 1; + } + uint256_t output = 0; + for (size_t i = 0; i < count; ++i) { + output *= BASE; + output += out_bits[count - 1 - i]; + } + return output; + }; + + /** + * @brief Normalize a base-11 integer where each base value can be > 1 + * + * Input = \sum_{i=0}^63 b_i * 11^i + * Output = \sum_{i=0}^63 (b_i & 1) * 11^i + * + * (XORs are evaluated by adding integers in sparse-form and normalizing. Even = 0, Odd = 1) + * + * @param input + * @return constexpr uint256_t + */ + static constexpr uint256_t normalize_sparse(uint256_t input) + { + std::array<uint64_t, 64> out_bits; + size_t count = 0; + while (input > 0) { + const auto [quotient, slice] = input.divmod(BASE); + uint64_t bit = static_cast<uint64_t>(slice) & 1; + out_bits[count++] = bit; + input = quotient; + } + uint256_t out; + for (size_t i = 0; i < count; ++i) { + out *= BASE; + out += out_bits[count - 1 - i]; + } + return out; + } + + /** + * @brief Get the sparse round constants object + * + * @return constexpr std::array<uint256_t, 24> + */ + static constexpr std::array<uint256_t, 24> get_sparse_round_constants() + { + std::array<uint256_t, 24> output; + for (size_t i = 0; i < 24; ++i) { + output[i] = convert_to_sparse(RC[i]); + } + return output; + } + static constexpr std::array<uint256_t, 24> SPARSE_RC = get_sparse_round_constants(); + + /** + * @brief Compute the constant offset added in the `Chi` round + * + * We want to compute, for each bit slice of the inputs A, B, C... + * 1 + 2A - B + C + * + * i.e. we need to add the constant value \sum_{i=0}^63 11^i + * + * @return constexpr uint256_t + */ + static constexpr uint256_t get_chi_offset() + { + uint256_t result = 0; + for (size_t i = 0; i < 64; ++i) { + result *= 11; + result += 1; + } + return result; + } + static constexpr uint256_t CHI_OFFSET = get_chi_offset(); + + struct keccak_state { + std::array<field_ct, 25> state; + std::array<field_ct, 25> state_msb; + std::array<field_ct, 25> twisted_state; + Composer* context; + }; + + template <size_t lane_index> static field_t<Composer> normalize_and_rotate(const field_ct& limb, field_ct& msb); + static void compute_twisted_state(keccak_state& internal); + static void theta(keccak_state& state); + static void rho(keccak_state& state); + static void pi(keccak_state& state); + static void chi(keccak_state& state); + static void iota(keccak_state& state, size_t round); + static void sponge_absorb(keccak_state& internal, + const std::vector<field_ct>& input_buffer, + const std::vector<field_ct>& msb_buffer); + static byte_array_ct sponge_squeeze(keccak_state& internal); + static void keccakf1600(keccak_state& state); + static byte_array_ct hash(byte_array_ct& input); + + static std::vector<uint8_t> hash_native(const std::vector<uint8_t>& data) + { + auto hash_result = ethash_keccak256(&data[0], data.size()); + + std::vector<uint8_t> output; + output.resize(32); + + memcpy((void*)&output[0], (void*)&hash_result.word64s[0], 32); + return output; + } +}; + +extern template class keccak<plonk::UltraComposer>; + +} // namespace stdlib +} // namespace plonk diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp new file mode 100644 index 0000000000..793ed5bdf1 --- /dev/null +++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp @@ -0,0 +1,221 @@ +#include "keccak.hpp" +#include <crypto/keccak/keccak.hpp> +#include <gtest/gtest.h> +#include <plonk/composer/ultra_composer.hpp> +#include <numeric/random/engine.hpp> +#include "../../primitives/plookup/plookup.hpp" + +using namespace barretenberg; +using namespace plonk; + +typedef plonk::UltraComposer Composer; +typedef stdlib::byte_array<Composer> byte_array; +typedef stdlib::public_witness_t<Composer> public_witness_t; +typedef stdlib::field_t<Composer> field_ct; +typedef stdlib::witness_t<Composer> witness_ct; + +namespace { +auto& engine = numeric::random::get_debug_engine(); +} + +namespace std { +inline std::ostream& operator<<(std::ostream& os, std::vector<uint8_t> const& t) +{ + os << "[ "; + for (auto e : t) { + os << std::setfill('0') << std::hex << std::setw(2) << (int)e << " "; + } + os << "]"; + return os; +} +} // namespace std + +TEST(stdlib_keccak, keccak_format_input_table) +{ + Composer composer = Composer(); + + for (size_t i = 0; i < 25; ++i) { + uint64_t limb_native = engine.get_random_uint64(); + field_ct limb(witness_ct(&composer, limb_native)); + stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_FORMAT_INPUT, limb); + } + auto prover = composer.create_prover(); + auto verifier = composer.create_verifier(); + + auto proof = prover.construct_proof(); + + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, keccak_format_output_table) +{ + Composer composer = Composer(); + + for (size_t i = 0; i < 25; ++i) { + uint64_t limb_native = engine.get_random_uint64(); + uint256_t extended_native = stdlib::keccak<Composer>::convert_to_sparse(limb_native); + field_ct limb(witness_ct(&composer, extended_native)); + stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_FORMAT_OUTPUT, limb); + } + auto prover = composer.create_prover(); + auto verifier = composer.create_verifier(); + auto proof = prover.construct_proof(); + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, keccak_theta_output_table) +{ + Composer composer = Composer(); + + for (size_t i = 0; i < 25; ++i) { + uint256_t extended_native = 0; + for (size_t j = 0; j < 8; ++j) { + extended_native *= 11; + uint64_t base_value = (engine.get_random_uint64() % 11); + extended_native += base_value; + } + field_ct limb(witness_ct(&composer, extended_native)); + stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_THETA_OUTPUT, limb); + } + auto prover = composer.create_prover(); + auto verifier = composer.create_verifier(); + auto proof = prover.construct_proof(); + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, keccak_rho_output_table) +{ + Composer composer = Composer(); + + barretenberg::constexpr_for<0, 25, 1>([&]<size_t i> { + uint256_t extended_native = 0; + uint256_t binary_native = 0; + for (size_t j = 0; j < 64; ++j) { + extended_native *= 11; + binary_native = binary_native << 1; + uint64_t base_value = (engine.get_random_uint64() % 3); + extended_native += base_value; + binary_native += (base_value & 1); + } + const size_t left_bits = stdlib::keccak<Composer>::ROTATIONS[i]; + const size_t right_bits = 64 - left_bits; + const uint256_t left = binary_native >> right_bits; + const uint256_t right = binary_native - (left << right_bits); + const uint256_t binary_rotated = left + (right << left_bits); + + const uint256_t expected_limb = stdlib::keccak<Composer>::convert_to_sparse(binary_rotated); + // msb only is correct iff rotation == 0 (no need to get msb for rotated lookups) + const uint256_t expected_msb = (binary_native >> 63); + field_ct limb(witness_ct(&composer, extended_native)); + field_ct result_msb; + field_ct result_limb = stdlib::keccak<Composer>::normalize_and_rotate<i>(limb, result_msb); + EXPECT_EQ(static_cast<uint256_t>(result_limb.get_value()), expected_limb); + EXPECT_EQ(static_cast<uint256_t>(result_msb.get_value()), expected_msb); + }); + + std::cout << "create prover?" << std::endl; + auto prover = composer.create_prover(); + std::cout << "create verifier" << std::endl; + printf("composer gates = %zu\n", composer.get_num_gates()); + auto verifier = composer.create_verifier(); + std::cout << "make proof" << std::endl; + auto proof = prover.construct_proof(); + std::cout << "verify proof" << std::endl; + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, keccak_chi_output_table) +{ + static constexpr uint64_t chi_normalization_table[5]{ + 0, // 1 + 2a - b + c => a xor (~b & c) + 0, 1, 1, 0, + }; + Composer composer = Composer(); + + for (size_t i = 0; i < 25; ++i) { + uint256_t normalized_native = 0; + uint256_t extended_native = 0; + uint256_t binary_native = 0; + for (size_t j = 0; j < 8; ++j) { + extended_native *= 11; + normalized_native *= 11; + binary_native = binary_native << 1; + uint64_t base_value = (engine.get_random_uint64() % 5); + extended_native += base_value; + normalized_native += chi_normalization_table[base_value]; + binary_native += chi_normalization_table[base_value]; + } + field_ct limb(witness_ct(&composer, extended_native)); + const auto accumulators = stdlib::plookup_read::get_lookup_accumulators(plookup::KECCAK_CHI_OUTPUT, limb); + + field_ct normalized = accumulators[plookup::ColumnIdx::C2][0]; + field_ct msb = accumulators[plookup::ColumnIdx::C3][accumulators[plookup::ColumnIdx::C3].size() - 1]; + + EXPECT_EQ(static_cast<uint256_t>(normalized.get_value()), normalized_native); + EXPECT_EQ(static_cast<uint256_t>(msb.get_value()), binary_native >> 63); + } + printf("composer gates = %zu\n", composer.get_num_gates()); + auto prover = composer.create_prover(); + auto verifier = composer.create_verifier(); + std::cout << "make proof" << std::endl; + auto proof = prover.construct_proof(); + std::cout << "verify proof" << std::endl; + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, test_single_block) +{ + Composer composer = Composer(); + std::string input = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz01"; + std::vector<uint8_t> input_v(input.begin(), input.end()); + + byte_array input_arr(&composer, input_v); + byte_array output = stdlib::keccak<Composer>::hash(input_arr); + + std::vector<uint8_t> expected = stdlib::keccak<Composer>::hash_native(input_v); + + EXPECT_EQ(output.get_value(), expected); + + composer.print_num_gates(); + + auto prover = composer.create_prover(); + std::cout << "prover circuit_size = " << prover.key->circuit_size << std::endl; + auto verifier = composer.create_verifier(); + + auto proof = prover.construct_proof(); + + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +} + +TEST(stdlib_keccak, test_double_block) +{ + Composer composer = Composer(); + std::string input = ""; + for (size_t i = 0; i < 200; ++i) { + input += "a"; + } + std::vector<uint8_t> input_v(input.begin(), input.end()); + + byte_array input_arr(&composer, input_v); + byte_array output = stdlib::keccak<Composer>::hash(input_arr); + + std::vector<uint8_t> expected = stdlib::keccak<Composer>::hash_native(input_v); + + EXPECT_EQ(output.get_value(), expected); + + composer.print_num_gates(); + + auto prover = composer.create_prover(); + auto verifier = composer.create_verifier(); + + auto proof = prover.construct_proof(); + + bool proof_result = verifier.verify_proof(proof); + EXPECT_EQ(proof_result, true); +}