From 05a61024fb9ff7f82c130e4fbf915555c1c18a1a Mon Sep 17 00:00:00 2001
From: Zachary James Williamson <blorktronics@gmail.com>
Date: Wed, 1 Mar 2023 18:59:37 +0000
Subject: [PATCH] implemented keccak hash in stdlib (#114)

* implemented keccak hash in stdlib

* making some minor naming fixes after rebase

---------

Co-authored-by: ledwards2225 <l.edwards.d@gmail.com>
---
 cpp/src/aztec/common/constexpr_utils.hpp      | 189 ++++++
 .../plookup_tables/keccak/keccak_chi.hpp      | 253 +++++++
 .../plookup_tables/keccak/keccak_input.hpp    | 144 ++++
 .../plookup_tables/keccak/keccak_output.hpp   | 175 +++++
 .../plookup_tables/keccak/keccak_rho.hpp      | 296 +++++++++
 .../plookup_tables/keccak/keccak_theta.hpp    | 255 +++++++
 .../plookup_tables/plookup_tables.cpp         |  14 +
 .../plookup_tables/plookup_tables.hpp         |  42 ++
 .../plonk/composer/plookup_tables/types.hpp   |  21 +-
 .../aztec/plonk/composer/ultra_composer.cpp   |  14 +
 .../aztec/plonk/composer/ultra_composer.hpp   |  29 +-
 cpp/src/aztec/stdlib/hash/CMakeLists.txt      |   3 +-
 .../aztec/stdlib/hash/keccak/CMakeLists.txt   |   1 +
 cpp/src/aztec/stdlib/hash/keccak/keccak.cpp   | 620 ++++++++++++++++++
 cpp/src/aztec/stdlib/hash/keccak/keccak.hpp   | 189 ++++++
 .../aztec/stdlib/hash/keccak/keccak.test.cpp  | 221 +++++++
 16 files changed, 2462 insertions(+), 4 deletions(-)
 create mode 100644 cpp/src/aztec/common/constexpr_utils.hpp
 create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp
 create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp
 create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp
 create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp
 create mode 100644 cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp
 create mode 100644 cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt
 create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.cpp
 create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.hpp
 create mode 100644 cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp

diff --git a/cpp/src/aztec/common/constexpr_utils.hpp b/cpp/src/aztec/common/constexpr_utils.hpp
new file mode 100644
index 0000000000..08afbcd200
--- /dev/null
+++ b/cpp/src/aztec/common/constexpr_utils.hpp
@@ -0,0 +1,189 @@
+#pragma once
+
+#include <stddef.h>
+#include <utility>
+#include <tuple>
+
+/**
+ * @brief constexpr_utils defines helper methods that perform STL-equivalent operations
+ * in a constexpr context over quantities known at compile-time
+ *
+ * Current methods are:
+ *
+ * constexpr_for : loop over a range, where the size_t iterator `i` is a constexpr variable
+ * constexpr_find : find if an element is in an array
+ * concatenate_arrays : smoosh multiple std::array objects into a single std::array
+ *
+ */
+namespace barretenberg {
+
+/**
+ * @brief Implements a loop using a compile-time iterator. Requires c++20.
+ * Implementation (and description) from https://artificial-mind.net/blog/2020/10/31/constexpr-for
+ *
+ * @tparam Start the loop start value
+ * @tparam End the loop end value
+ * @tparam Inc how much the iterator increases by per iteration
+ * @tparam F a Lambda function that is executed once per loop
+ *
+ * @param f An rvalue reference to the lambda
+ * @details Implements a `for` loop where the iterator is a constexpr variable.
+ * Use this when you need to evaluate `if constexpr` statements on the iterator (or apply other constexpr expressions)
+ * Outside of this use-case avoid using this fn as it gives negligible performance increases vs regular loops.
+ *
+ * N.B. A side-effect of this method is that all loops will be unrolled
+ * (each loop iteration uses different iterator template parameters => unique constexpr_for implementation per
+ * iteration)
+ * Do not use this for large (~100+) loops!
+ *
+ * ##############################
+ * EXAMPLE USE OF `constexpr_for`
+ * ##############################
+ *
+ * constexpr_for<0, 10, 1>([&]<size_t i>(){
+ *  if constexpr ((i & 1) == 0)
+ *  {
+ *      foo[i] = even_container[i >> 1];
+ *  }
+ *  else
+ *  {
+ *      foo[i] = odd_container[i >> 1];
+ *  }
+ * });
+ *
+ * In the above example we are iterating from i = 0 to i < 10.
+ * The provided lambda function has captured everything in its surrounding scope (via `[&]`),
+ * which is where `foo`, `even_container` and `odd_container` have come from.
+ *
+ * We do not need to explicitly define the `class F` parameter as the compiler derives it from our provided input
+ * argument `F&& f` (i.e. the lambda function)
+ *
+ * In the loop itself we're evaluating a constexpr if statement that defines which code path is taken.
+ *
+ * The above example benefits from `constexpr_for` because a run-time `if` statement has been reduced to a compile-time
+ * `if` statement. N.B. this would only give measurable improvements if the `constexpr_for` statement is itself in a hot
+ * loop that's iterated over many (>thousands) times
+ */
+template <size_t Start, size_t End, size_t Inc, class F> constexpr void constexpr_for(F&& f)
+{
+    // Call function `f<Start>()` iff Start < End
+    if constexpr (Start < End) {
+        // F must be a template lambda with a single **typed** template parameter that represents the iterator
+        // (e.g. [&]<size_t i>(){ ... } is good)
+        // (and [&]<typename i>(){ ... } won't compile!)
+
+        /**
+         * Explaining f.template operator()<Start>()
+         *
+         * The following line must explicitly tell the compiler that <Start> is a template parameter by using the
+         * `template` keyword.
+         * (if we wrote f<Start>(), the compiler could legitimately interpret `<` as a less than symbol)
+         *
+         * The fragment `f.template` tells the compiler that we're calling a *templated* member of `f`.
+         * The "member" being called is the function operator, `operator()`, which must be explicitly provided
+         * (for any callable object X, `X(args)` is shorthand for `X.operator()(args)`)
+         * The compiler has no alias `X.template <tparam>(args)` for `X.template operator()<tparam>(args)` so we must
+         * write it explicitly here
+         *
+         * To summarise what the next line tells the compiler...
+         * 1. I want to call a member of `f` that expects one or more template parameters
+         * 2. The member of `f` that I want to call is the function operator
+         * 3. The template parameter is `Start`
+         * 4. The function operator itself takes no arguments
+         */
+        f.template operator()<Start>();
+
+        // Once we have executed `f`, we recursively call the `constexpr_for` function, increasing the value of `Start`
+        // by `Inc`
+        constexpr_for<Start + Inc, End, Inc>(f);
+    }
+}
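+
+/*
+ * Illustrative sketch (not part of the library): conceptually, `constexpr_for<0, 3, 1>(f)` unrolls into
+ *
+ *   f.template operator()<0>();
+ *   f.template operator()<1>();
+ *   f.template operator()<2>();
+ *
+ * i.e. one instantiation per iteration, which is why large (~100+) loop ranges should be avoided.
+ */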
+
+/**
+ * @brief returns true/false depending on whether `key` is in `container`
+ *
+ * @tparam container i.e. what are we looking in?
+ * @tparam key i.e. what are we looking for?
+ * @return true found!
+ * @return false not found!
+ *
+ * @details method is constexpr and can be used in static_asserts
+ */
+template <const auto& container, auto key> constexpr bool constexpr_find()
+{
+    // using ElementType = typename std::remove_extent<ContainerType>::type;
+    bool found = false;
+    constexpr_for<0, container.size(), 1>([&]<size_t k>() {
+        if constexpr (std::get<k>(container) == key) {
+            found = true;
+        }
+    });
+    return found;
+}
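+
+/*
+ * Example usage (illustrative only; `SUPPORTED_BASES` is a hypothetical constant, not part of this library):
+ *
+ *   static constexpr std::array<uint64_t, 3> SUPPORTED_BASES{ 2, 4, 11 };
+ *   static_assert(constexpr_find<SUPPORTED_BASES, uint64_t(11)>(), "base 11 must be supported");
+ */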
+
+/**
+ * @brief merges multiple std::arrays into a single array.
+ * Array lengths can be different but array type must match
+ * Method is constexpr and should concat constexpr arrays at compile time
+ *
+ * @tparam Type the array type
+ * @tparam sizes template parameter pack of size_t value params
+ * @param arrays
+ * @return constexpr auto
+ *
+ * @details template params should be autodeducted. Example use case:
+ *
+ * ```
+ *   std::array<int, 2> a{ 1, 2 };
+ *   std::array<int, 3> b{ 1, 3, 5 };
+ *   std::array<int, 5> c = concatenate_arrays(a, b);
+ * ```
+ */
+template <typename Type, std::size_t... sizes>
+constexpr auto concatenate_arrays(const std::array<Type, sizes>&... arrays)
+{
+    return std::apply([](auto... elems) -> std::array<Type, (sizes + ...)> { return { { elems... } }; },
+                      std::tuple_cat(std::tuple_cat(arrays)...));
+}
+
+/**
+ * @brief Create a constexpr array object whose elements contain a default value
+ *
+ * @tparam T type contained in the array
+ * @tparam Is index sequence
+ * @param value the value each array element is being initialized to
+ * @return constexpr std::array<T, sizeof...(Is)>
+ *
+ * @details This method is used to create constexpr arrays whose encapsulated type:
+ *
+ * 1. HAS NO CONSTEXPR DEFAULT CONSTRUCTOR
+ * 2. HAS A CONSTEXPR COPY CONSTRUCTOR
+ *
+ * An example of this is barretenberg::field_t
+ * (the default constructor does not default assign values to the field_t member variables for efficiency reasons, to
+ * reduce the time required to construct large arrays of field elements. This means the default constructor for field_t
+ * cannot be constexpr)
+ */
+template <typename T, std::size_t... Is>
+constexpr std::array<T, sizeof...(Is)> create_array(T value, std::index_sequence<Is...>)
+{
+    // cast Is to void to remove the warning: unused value
+    std::array<T, sizeof...(Is)> result = { { (static_cast<void>(Is), value)... } };
+    return result;
+}
+
+/**
+ * @brief Create a constexpr array object whose elements are all 0
+ *
+ * @tparam T
+ * @tparam N
+ * @return constexpr std::array<T, N>
+ *
+ * @details Use in the same context as create_array, i.e. when the encapsulated type has a default constructor that is not
+ * constexpr
+ */
+template <typename T, size_t N> constexpr std::array<T, N> create_empty_array()
+{
+    return create_array(T(0), std::make_index_sequence<N>());
+}
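+
+/*
+ * Example usage (illustrative only): building constexpr arrays of types whose default constructor is not constexpr.
+ * The int example is included purely to show template deduction; `create_empty_array` is the intended use with field_t.
+ *
+ *   constexpr auto field_zeroes = create_empty_array<barretenberg::fr, 4>();
+ *   constexpr std::array<int, 3> threes = create_array(3, std::make_index_sequence<3>());
+ */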
+}; // namespace barretenberg
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp
new file mode 100644
index 0000000000..e7245a665d
--- /dev/null
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_chi.hpp
@@ -0,0 +1,253 @@
+#pragma once
+
+#include <numeric/bitop/pow.hpp>
+#include <common/constexpr_utils.hpp>
+#include "../types.hpp"
+
+namespace plookup {
+namespace keccak_tables {
+
+/**
+ * @brief Generates plookup tables required for CHI round of Keccak hash function
+ *
+ * Keccak has 25 hash lanes, each represented as 64-bit integers. The CHI round performs the following operation on 3
+ * hash lanes:
+ *
+ * A ^ (~B & C)
+ *
+ * We evaluate in-circuit using a base-11 sparse integer representation:
+ *
+ * P = \sum_{i=0}^{63} b_i * 11^i
+ *
+ * In this representation we evaluate CHI via the linear expression
+ *
+ * 2.A - B + C + Q
+ *
+ * Where Q is the precomputed constant \sum_{i=0}^63 11^i
+ *
+ * This can be represented via the 'truth table' for each base-11 quasi-bit:
+ *
+ * | A | B | C | Algebraic Output |
+ * | - | - | - | ---------------- |
+ * | 0 | 0 | 0 | 1                |
+ * | 0 | 0 | 1 | 2                |
+ * | 0 | 1 | 0 | 0                |
+ * | 0 | 1 | 1 | 1                |
+ * | 1 | 0 | 0 | 3                |
+ * | 1 | 0 | 1 | 4                |
+ * | 1 | 1 | 0 | 2                |
+ * | 1 | 1 | 1 | 3                |
+ *
+ * CHI round uses a plookup table that normalizes the algebraic output back into the binary output.
+ *
+ * | Algebraic Output | Binary Output |
+ * | ---------------- | ------------- |
+ * | 0                | 0             |
+ * | 1                | 0             |
+ * | 2                | 1             |
+ * | 3                | 1             |
+ * | 4                | 0             |
+ *
+ * In addition, we use the CHI lookup table to determine the value of the most significant (63rd) bit of the output
+ *
+ * Each table row is defined by a tuple (j_0, ..., j_{TABLE_BITS-1}), where each j_i is in [0, 1, 2, 3, 4].
+ * The column values of our lookup table are:
+ *
+ * Column1 value = \sum_{i=0}^{TABLE_BITS-1} 11^i * j_i
+ * Column2 value = \sum_{i=0}^{TABLE_BITS-1} 11^i * CHI_NORMALIZATION_TABLE[j_i]
+ * Column3 value = Column2 / divisor (the divisor is chosen so that, when this table processes a lane's most
+ *                 significant slice, Column3 equals the lane's most significant quasi-bit)
+ *
+ */
+class Chi {
+  public:
+    // 1 + 2a - b + c => a xor (~b & c)
+    static constexpr uint64_t CHI_NORMALIZATION_TABLE[5]{
+        0, 0, 1, 1, 0,
+    };
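+
+    /*
+     * Sanity sketch (illustrative only, not used by the circuit): for single bits a, b, c the value
+     * 1 + 2a - b + c lies in [0, 4], and CHI_NORMALIZATION_TABLE maps it back to the boolean a ^ (~b & c):
+     *
+     *   for (uint64_t a = 0; a < 2; ++a)
+     *       for (uint64_t b = 0; b < 2; ++b)
+     *           for (uint64_t c = 0; c < 2; ++c)
+     *               assert(CHI_NORMALIZATION_TABLE[1 + 2 * a - b + c] == (a ^ (~b & c & 1)));
+     */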
+
+    static constexpr uint64_t BASE = 11;
+
+    // effective base = 1 + maximum value each input 'quasi-bit' can reach at this stage of the Keccak algo
+    // (we use base11 as it's a bit more efficient to use a consistent base across the entire Keccak hash algorithm.
+    //  The THETA round requires base-11 in order to most efficiently convert XOR operations into algebraic operations)
+    static constexpr uint64_t EFFECTIVE_BASE = 5;
+
+    // TABLE_BITS defines table size. More bits = fewer lookups but larger tables!
+    static constexpr uint64_t TABLE_BITS = 6;
+
+    /**
+     * @brief Given a table input value, return the table output value
+     *
+     * Used by the Plookup code to precompute lookup tables and generate witness values
+     *
+     * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value)
+     * @return std::array<barretenberg::fr, 2> table output (normalized input, and normalized input / divisor to
+     * extract the most significant quasi-bit of the final slice)
+     */
+    static std::array<barretenberg::fr, 2> get_chi_renormalization_values(const std::array<uint64_t, 2> key)
+    {
+        uint64_t accumulator = 0;
+        uint64_t input = key[0];
+        uint64_t base_shift = 1;
+        constexpr uint64_t divisor_exponent = (64 % TABLE_BITS == 0) ? (TABLE_BITS - 1) : (64 % TABLE_BITS) - 1;
+        constexpr uint64_t divisor = numeric::pow64(BASE, divisor_exponent);
+        while (input > 0) {
+            uint64_t slice = input % BASE;
+            uint64_t bit = CHI_NORMALIZATION_TABLE[static_cast<size_t>(slice)];
+            accumulator += (bit * base_shift);
+            input /= BASE;
+            base_shift *= BASE;
+        }
+
+        return { barretenberg::fr(accumulator), barretenberg::fr(accumulator / divisor) };
+    }
+
+    /**
+     * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1])
+     * Code is slightly faster at runtime if we compute this at compile time
+     *
+     * @return constexpr std::array<uint64_t, TABLE_BITS>
+     */
+    static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases()
+    {
+        std::array<uint64_t, TABLE_BITS> result;
+        uint64_t acc = 1;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            result[i] = acc;
+            acc *= BASE;
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column values for next row of plookup table. Used to generate plookup table row values
+     *
+     * Input `counts` is an array of quasi-bits that represent the current row.
+     * Method increases `counts` by 1 and returns the plookup table column values.
+     *
+     * (a bit tricky to compute because each quasi-bit ranges from [0, 1, 2, 3, 4],
+     *  but we're working with base-11 numbers.
+     *  i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!)
+     *
+     * @param counts The current row value represented as an array of quasi-bits
+     * @return std::array<uint64_t, 3> the columns of the plookup table
+     */
+    static std::array<uint64_t, 3> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts)
+    {
+        static constexpr auto scaled_bases = get_scaled_bases();
+
+        // want the most significant bit of the 64-bit integer, when this table is used on the most significant slice
+        constexpr uint64_t divisor_exponent = (64 % TABLE_BITS == 0) ? (TABLE_BITS - 1) : (64 % TABLE_BITS) - 1;
+        constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), divisor_exponent);
+
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            if (counts[i] == EFFECTIVE_BASE - 1) {
+                counts[i] = 0;
+            } else {
+                counts[i] += 1;
+                break;
+            }
+        }
+
+        uint64_t value = 0;
+        uint64_t normalized_value = 0;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            value += counts[i] * scaled_bases[i];
+            normalized_value += (CHI_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i];
+        }
+        return { value, normalized_value, normalized_value / divisor };
+    }
+
+    /**
+     * @brief Generate the CHI plookup table
+     *
+     * This table is used by Composer objects to generate plookup constraints
+     *
+     * @param id a compile-time ID defined via plookup_tables.hpp
+     * @param table_index a circuit-specific ID for the table used by the circuit Composer
+     * @return BasicTable
+     */
+    static BasicTable generate_chi_renormalization_table(BasicTableId id, const size_t table_index)
+    {
+        BasicTable table;
+        table.id = id;
+        table.table_index = table_index;
+        table.use_twin_keys = false;
+        table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS);
+
+        std::array<size_t, TABLE_BITS> counts{};
+        std::array<uint64_t, 3> column_values{ 0, 0, 0 };
+        for (size_t i = 0; i < table.size; ++i) {
+            table.column_1.emplace_back(column_values[0]);
+            table.column_2.emplace_back(column_values[1]);
+            table.column_3.emplace_back(column_values[2]);
+            column_values = get_column_values_for_next_row(counts);
+        }
+
+        table.get_values_from_key = &get_chi_renormalization_values;
+
+        constexpr uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS);
+        table.column_1_step_size = barretenberg::fr(step_size);
+        table.column_2_step_size = barretenberg::fr(step_size);
+        table.column_3_step_size = barretenberg::fr(0);
+        return table;
+    }
+
+    /**
+     * @brief Create the CHI MultiTable used by plookup to generate a sequence of lookups
+     *
+     * The CHI round operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits.
+     *
+     * i.e. multiple lookups are required for a single 64-bit integer.
+     *
+     * If we group these lookups together, we can derive the plookup column values
+     * from the relative difference between wire values.
+     *
+     * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the
+     * output slices
+     *
+     * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B, C where
+     *      A = \sum_{i=0}^7 A_i * 11^{8i}
+     *      B = \sum_{i=0}^7 B_i * 11^{8i}
+     *      C_i = B_i / 11^7 (the most significant quasi-bit of slice B_i)
+     *
+     * Our plookup gates will have the following wire values:
+     *
+     * | W1                             | W2                             | W3  |
+     * | ------------------------------ | ------------------------------ | --- |
+     * | \sum_{i=0}^7 A_i * 11^{8i}     | \sum_{i=0}^7 B_i * 11^{8i}     | C_0 |
+     * | \sum_{i=1}^7 A_i * 11^{8(i-1)} | \sum_{i=1}^7 B_i * 11^{8(i-1)} | C_1 |
+     * | \sum_{i=2}^7 A_i * 11^{8(i-2)} | \sum_{i=2}^7 B_i * 11^{8(i-2)} | C_2 |
+     * | ...                            | ...                            | ... |
+     * | A_7                            | B_7                            | C_7 |
+     *
+     * The plookup protocol extracts the 1st and 2nd lookup column values by taking:
+     *
+     *      Column1 = W1[i] - 11^8 . W1[i + 1]
+     *      Column2 = W2[i] - 11^8 . W2[i + 1]
+     *
+     * (where the -11^8 coefficient is stored in a precomputed selector polynomial)
+     *
+     * This MultiTable construction defines the value of these precomputed selector polynomial values,
+     * as well as defines how the column values are derived from a starting input value.
+     *
+     * @param id
+     * @return MultiTable
+     */
+    static MultiTable get_chi_output_table(const MultiTableId id = KECCAK_CHI_OUTPUT)
+    {
+        constexpr size_t num_tables_per_multitable =
+            (64 / TABLE_BITS) + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 bits, TABLE_BITS = 6 bits per entry
+
+        // column_multiplier is used to define the gap between rows when deriving column values via relative differences
+        uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS);
+        MultiTable table(column_multiplier, column_multiplier, 0, num_tables_per_multitable);
+
+        table.id = id;
+        for (size_t i = 0; i < num_tables_per_multitable; ++i) {
+            table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS));
+            table.lookup_ids.emplace_back(KECCAK_CHI);
+            table.get_table_values.emplace_back(&get_chi_renormalization_values);
+        }
+        return table;
+    }
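+
+    /*
+     * Note (worked arithmetic, not code): with TABLE_BITS = 6, num_tables_per_multitable = 64 / 6 + 1 = 11,
+     * i.e. each 64-bit lane requires 11 lookups (ten 6-bit slices plus one final 4-bit slice).
+     */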
+};
+} // namespace keccak_tables
+} // namespace plookup
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp
new file mode 100644
index 0000000000..5da5c79074
--- /dev/null
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_input.hpp
@@ -0,0 +1,144 @@
+#pragma once
+
+#include <numeric/bitop/pow.hpp>
+#include <numeric/bitop/sparse_form.hpp>
+#include <common/constexpr_utils.hpp>
+#include "../types.hpp"
+
+namespace plookup {
+namespace keccak_tables {
+
+/**
+ * @brief Generates plookup tables used to convert 64-bit integers into the sparse representation used by the Keccak
+ * hash algorithm
+ *
+ * Keccak has 25 hash lanes, each represented as 64-bit integers.
+ *
+ * We evaluate in-circuit using a base-11 sparse integer representation for each lane:
+ *
+ * P = \sum_{i=0}^{63} b_i * 11^i
+ *
+ * KeccakInput defines the plookup table that maps binary integer slices into base-11 integer slices.
+ *
+ * In addition, KeccakInput is also used to determine the value of the most significant (63rd) bit of the input
+ * (which is used by stdlib::keccak to more efficiently left-rotate by 1 bit)
+ */
+class KeccakInput {
+
+  public:
+    static constexpr uint64_t BASE = 11;
+    static constexpr size_t TABLE_BITS = 8;
+
+    /**
+     * @brief Given a table input value, return the table output value
+     *
+     * Used by the Plookup code to precompute lookup tables and generate witness values
+     *
+     * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value)
+     * @return std::array<barretenberg::fr, 2> table output
+     */
+    static std::array<barretenberg::fr, 2> get_keccak_input_values(const std::array<uint64_t, 2> key)
+    {
+        const uint256_t t0 = numeric::map_into_sparse_form<BASE>(key[0]);
+
+        constexpr size_t msb_shift = (64 % TABLE_BITS == 0) ? TABLE_BITS - 1 : (64 % TABLE_BITS) - 1;
+        const uint256_t t1 = key[0] >> msb_shift;
+        return { barretenberg::fr(t0), barretenberg::fr(t1) };
+    }
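+
+    /*
+     * Worked example (illustrative only): for key[0] = 0b1011 = 11, map_into_sparse_form<11> returns
+     * 1*11^3 + 0*11^2 + 1*11 + 1 = 1343, and (with TABLE_BITS = 8, msb_shift = 7) the second output is 11 >> 7 = 0:
+     *
+     *   const auto out = get_keccak_input_values({ 0b1011, 0 });
+     *   // out[0] == barretenberg::fr(1343), out[1] == barretenberg::fr(0)
+     */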
+
+    /**
+     * @brief Generate plookup table that maps a TABLE_BITS-slice of a base-2 integer into a base-11 representation
+     *
+     * @param id
+     * @param table_index
+     * @return BasicTable
+     */
+    static BasicTable generate_keccak_input_table(BasicTableId id, const size_t table_index)
+    {
+        BasicTable table;
+        table.id = id;
+        table.table_index = table_index;
+        table.size = (1U << TABLE_BITS);
+        table.use_twin_keys = false;
+        constexpr size_t msb_shift = (64 % TABLE_BITS == 0) ? TABLE_BITS - 1 : (64 % TABLE_BITS) - 1;
+
+        for (uint64_t i = 0; i < table.size; ++i) {
+            const uint64_t source = i;
+            const auto target = numeric::map_into_sparse_form<BASE>(source);
+            table.column_1.emplace_back(barretenberg::fr(source));
+            table.column_2.emplace_back(barretenberg::fr(target));
+            table.column_3.emplace_back(barretenberg::fr(source >> msb_shift));
+        }
+
+        table.get_values_from_key = &get_keccak_input_values;
+
+        uint256_t sparse_step_size = 1;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            sparse_step_size *= BASE;
+        }
+        table.column_1_step_size = barretenberg::fr((1 << TABLE_BITS));
+        table.column_2_step_size = barretenberg::fr(sparse_step_size);
+        table.column_3_step_size = barretenberg::fr(sparse_step_size);
+
+        return table;
+    }
+
+    /**
+     * @brief Create the KeccakInput MultiTable used by plookup to generate a sequence of lookups
+     *
+     * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits.
+     *
+     * i.e. multiple lookups are required for a single 64-bit integer.
+     *
+     * If we group these lookups together, we can derive the plookup column values
+     * from the relative difference between wire values.
+     *
+     * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the
+     * output slices
+     *
+     * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B, C where
+     *      A = \sum_{i=0}^7 A_i * 2^{8i}
+     *      B = \sum_{i=0}^7 B_i * 11^{8i}
+     *      C_i = A_i >> 7 (the most significant bit of each binary slice)
+     *
+     * Our plookup gates will have the following wire values:
+     *
+     * | W1                            | W2                             | W3  |
+     * | ----------------------------- | ------------------------------ | --- |
+     * | \sum_{i=0}^7 A_i * 2^{8i}     | \sum_{i=0}^7 B_i * 11^{8i}     | C_0 |
+     * | \sum_{i=1}^7 A_i * 2^{8(i-1)} | \sum_{i=1}^7 B_i * 11^{8(i-1)} | C_1 |
+     * | \sum_{i=2}^7 A_i * 2^{8(i-2)} | \sum_{i=2}^7 B_i * 11^{8(i-2)} | C_2 |
+     * | ...                           | ...                            | ... |
+     * | A_7                           | B_7                            | C_7 |
+     *
+     * The plookup protocol extracts the 1st and 2nd lookup column values by taking:
+     *
+     *      Column1 = W1[i] - 2^8 . W1[i + 1]
+     *      Column2 = W2[i] - 11^8 . W2[i + 1]
+     *
+     * (where the -2^8 and -11^8 coefficients are stored in precomputed selector polynomials)
+     *
+     * This MultiTable construction defines the value of these precomputed selector polynomial values,
+     * as well as defines how the column values are derived from a starting input value.
+     *
+     * @param id
+     * @return MultiTable
+     */
+    static MultiTable get_keccak_input_table(const MultiTableId id = KECCAK_FORMAT_INPUT)
+    {
+        const size_t num_entries = 8;
+
+        MultiTable table(1 << 8, uint256_t(11).pow(8), 0, num_entries);
+
+        table.id = id;
+        for (size_t i = 0; i < num_entries; ++i) {
+            table.slice_sizes.emplace_back(1 << 8);
+            table.lookup_ids.emplace_back(KECCAK_INPUT);
+            table.get_table_values.emplace_back(&get_keccak_input_values);
+        }
+        return table;
+    }
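+
+    /*
+     * Sketch of the accumulation identity the MultiTable relies on (illustrative only, plain C++):
+     * if wire values accumulate as W[i] = slice_i + step * W[i + 1], then each slice is recovered from the
+     * relative difference W[i] - step * W[i + 1], which is exactly what the step-size selectors encode.
+     *
+     *   uint64_t slices[4] = { 3, 1, 4, 1 };
+     *   uint64_t step = uint64_t(1) << 8;
+     *   uint64_t W[5] = { 0, 0, 0, 0, 0 };
+     *   for (size_t i = 4; i-- > 0;) W[i] = slices[i] + step * W[i + 1];
+     *   for (size_t i = 0; i < 4; ++i) assert(W[i] - step * W[i + 1] == slices[i]);
+     */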
+};
+
+} // namespace keccak_tables
+} // namespace plookup
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp
new file mode 100644
index 0000000000..4f5fcdd0f7
--- /dev/null
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_output.hpp
@@ -0,0 +1,175 @@
+#pragma once
+
+#include <numeric/bitop/pow.hpp>
+#include <numeric/bitop/sparse_form.hpp>
+#include <common/constexpr_utils.hpp>
+
+#include "../types.hpp"
+#include "../sparse.hpp"
+
+namespace plookup {
+namespace keccak_tables {
+
+/**
+ * @brief Converts a base-11 sparse integer representation into a regular base-2 binary integer.
+ *        Used by the Keccak hash algorithm to convert the output of the algorithm into a regular integer.
+ */
+class KeccakOutput {
+
+  public:
+    static constexpr uint64_t BASE = 11;
+
+    // effective base = 1 + maximum value each input 'quasi-bit' can reach
+    // (the input is in base-11 representation, but at this point in the Keccak algorithm each 'quasi-bit' can only
+    // take values [0, 1] not [0, ..., 10])
+    static constexpr uint64_t EFFECTIVE_BASE = 2;
+    static constexpr size_t TABLE_BITS = 8;
+
+    static constexpr uint64_t OUTPUT_NORMALIZATION_TABLE[2]{ 0, 1 };
+
+    /**
+     * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1])
+     * Code is slightly faster at runtime if we compute this at compile time
+     *
+     * @return constexpr std::array<uint64_t, TABLE_BITS>
+     */
+    static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases()
+    {
+        std::array<uint64_t, TABLE_BITS> result;
+        uint64_t acc = 1;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            result[i] = acc;
+            acc *= BASE;
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column values for next row of plookup table. Used to generate plookup table row values
+     *
+     * Input `counts` is an array of quasi-bits that represent the current row.
+     * Method increases `counts` by 1 and returns the plookup table column values.
+     *
+     * (a bit tricky to compute because each quasi-bit ranges from [0, 1],
+     *  but we're working with base-11 numbers.
+     *  i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!)
+     *
+     * @param counts The current row value represented as an array of quasi-bits
+     * @return std::array<uint64_t, 2> first and second columns of plookup table (3rd column is 0)
+     */
+    static std::array<uint64_t, 2> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts)
+    {
+        static constexpr auto scaled_bases = get_scaled_bases();
+
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            if (counts[i] == EFFECTIVE_BASE - 1) {
+                counts[i] = 0;
+            } else {
+                counts[i] += 1;
+                break;
+            }
+        }
+
+        uint64_t value = 0;
+        uint64_t normalized_value = 0;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            value += counts[i] * scaled_bases[i];
+            normalized_value += (OUTPUT_NORMALIZATION_TABLE[counts[i]]) << i;
+        }
+        return { value, normalized_value };
+    }
+
+    /**
+     * @brief Generate plookup table that maps a TABLE_BITS-slice of a base-11 integer into a base-2 integer
+     *
+     * @param id
+     * @param table_index
+     * @return BasicTable
+     */
+    static BasicTable generate_keccak_output_table(BasicTableId id, const size_t table_index)
+    {
+        BasicTable table;
+        table.id = id;
+        table.table_index = table_index;
+        table.use_twin_keys = false;
+        table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS);
+
+        std::array<size_t, TABLE_BITS> counts{};
+        std::array<uint64_t, 2> column_values{ 0, 0 };
+
+        for (size_t i = 0; i < table.size; ++i) {
+            table.column_1.emplace_back(column_values[0]);
+            table.column_2.emplace_back(column_values[1]);
+            table.column_3.emplace_back(0);
+            column_values = get_column_values_for_next_row(counts);
+        }
+
+        table.get_values_from_key = &sparse_tables::get_sparse_normalization_values<BASE, OUTPUT_NORMALIZATION_TABLE>;
+
+        table.column_1_step_size = barretenberg::fr(numeric::pow64(static_cast<size_t>(BASE), TABLE_BITS));
+        table.column_2_step_size = barretenberg::fr(((uint64_t)1 << TABLE_BITS));
+        table.column_3_step_size = 0;
+        return table;
+    }
+
+    /**
+     * @brief Create the KeccakOutput MultiTable used by plookup to generate a sequence of lookups
+     *
+     * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits.
+     *
+     * i.e. multiple lookups are required for a single 64-bit integer.
+     *
+     * If we group these lookups together, we can derive the plookup column values
+     * from the relative difference between wire values.
+     *
+     * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the
+     * output slices
+     *
+     * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B where
+     *      A = \sum_{i=0}^7 A_i * 11^{8i}
+     *      B = \sum_{i=0}^7 B_i * 2^{8i}
+     *
+     * Our plookup gates will have the following wire values:
+     *
+     * | W1                             | W2                            | W3  |
+     * | ------------------------------ | ----------------------------- | --- |
+     * | \sum_{i=0}^7 A_i * 11^{8i}     | \sum_{i=0}^7 B_i * 2^{8i}     | 0   |
+     * | \sum_{i=1}^7 A_i * 11^{8(i-1)} | \sum_{i=1}^7 B_i * 2^{8(i-1)} | 0   |
+     * | \sum_{i=2}^7 A_i * 11^{8(i-2)} | \sum_{i=2}^7 B_i * 2^{8(i-2)} | 0   |
+     * | ...                            | ...                           | ... |
+     * | A_7                            | B_7                           | 0   |
+     *
+     * The plookup protocol extracts the 1st and 2nd lookup column values by taking:
+     *
+     *      Column1 = W1[i] - 11^8 . W1[i + 1]
+     *      Column2 = W2[i] - 2^8 . W2[i + 1]
+     *
+     * (where the -11^8 and -2^8 coefficients are stored in precomputed selector polynomials)
+     *
+     * This MultiTable construction defines the value of these precomputed selector polynomial values,
+     * as well as defines how the column values are derived from a starting input value.
+     *
+     * @param id
+     * @return MultiTable
+     */
+    static MultiTable get_keccak_output_table(const MultiTableId id = KECCAK_FORMAT_OUTPUT)
+    {
+        constexpr size_t num_tables_per_multitable =
+            64 / TABLE_BITS + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 bits, 8 bits per entry
+
+        uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS);
+        MultiTable table(column_multiplier, (1ULL << TABLE_BITS), 0, num_tables_per_multitable);
+
+        table.id = id;
+        for (size_t i = 0; i < num_tables_per_multitable; ++i) {
+            table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS));
+            table.lookup_ids.emplace_back(KECCAK_OUTPUT);
+            table.get_table_values.emplace_back(
+                &sparse_tables::get_sparse_normalization_values<BASE, OUTPUT_NORMALIZATION_TABLE>);
+        }
+        return table;
+    }
+};
+
+} // namespace keccak_tables
+} // namespace plookup
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp
new file mode 100644
index 0000000000..e7bd5ad223
--- /dev/null
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_rho.hpp
@@ -0,0 +1,296 @@
+#pragma once
+
+#include <numeric/bitop/pow.hpp>
+#include <common/constexpr_utils.hpp>
+#include "../types.hpp"
+
+namespace plookup {
+namespace keccak_tables {
+
+/**
+ * @brief Generate the plookup tables used for the RHO round of the Keccak hash algorithm
+ *
+ * Keccak has 25 hash lanes, each represented as 64-bit integers.
+ * The RHO round performs a left-rotation on each lane, by a fixed rotation value defined by the ROTATIONS array
+ *
+ * We evaluate Keccak in-circuit using a base-11 sparse integer representation of each hash lane:
+ *
+ * P = \sum_{j=0}^63 b_i * 11^i
+ *
+ * This allows us to replace binary operations (XOR, AND) with algebraic ones, combined with plookup tables.
+ * (see keccak_chi.hpp for comments on this).
+ *
+ * At this point in the algorithm, each hash lane 'quasi-bit' is in the range [0, 1, 2].
+ *
+ * The RHO lookup tables are used to perform the following:
+ *
+ * 1. Normalize each quasi-bit so that P_out = \sum_{j=0}^63 (b_i mod 2) * 11^i
+ * 2. Perform a left-rotation whose value is defined by a value in the ROTATIONS array
+ * 3. Extract the most significant bit of the non-rotated normalized output
+ *
+ * The most significant bit component is required because we use this table to
+ * normalize XOR operations in the IOTA round and the `sponge_absorb` phase of the algorithm.
+ * (Both IOTA and `sponge_absorb` are followed by the THETA round which requires the msb of each hash lane)
+ *
+ * Rotations are performed by splitting the input into 'left' and 'right' bit slices
+ * (left slice = bits that wrap around the 11^64 modulus of our base-11 integers)
+ * (right slice = bits that don't wrap)
+ *
+ * Both slices are fed into a Rho plookup table. The outputs are stitched together to produce the rotated value.
+ *
+ * We need multiple Rho tables in order to efficiently range-constrain our input slices.
+ *
+ * The maximum number of bits we can accommodate in this lookup table is MAXIMUM_MULTITABLE_BITS (assume this is 8)
+ * For example take a left-rotation by 1 bit. The right-slice will be a 63-bit integer.
+ * 63 does not evenly divide 8. i.e. an 8-bit table cannot correctly range-constrain the input slice and we would need
+ * additional range constraints.
+ * We solve this problem by creating multiple Rho lookup tables each of different sizes (1 bit, 2 bits, ..., 8 bits).
+ *
+ * We can stitch together a lookup table sequence that correctly range constrains the left/right slices for any of our
+ * 25 rotation values
+ *
+ * @tparam TABLE_BITS The number of bits each lookup table can accommodate
+ * @tparam LANE_INDEX Required by get_rho_output_table to produce the correct MultiTable
+ */
+template <size_t TABLE_BITS = 0, size_t LANE_INDEX = 0> class Rho {
+
+  public:
+    static constexpr uint64_t BASE = 11;
+
+    // EFFECTIVE_BASE = 1 + maximum value each input 'quasi-bit' can reach at this stage in the Keccak algo
+    // (we use base11 as it's a bit more efficient to use a consistent base across the entire Keccak hash algorithm.
+    //  The THETA round requires base-11 in order to most efficiently convert XOR operations into algebraic operations)
+    static constexpr uint64_t EFFECTIVE_BASE = 3;
+
+    // The maximum number of bits of a Rho lookup table (TABLE_BITS <= MAXIMUM_MULTITABLE_BITS).
+    // Used in get_rho_output_table
+    static constexpr size_t MAXIMUM_MULTITABLE_BITS = 8;
+
+    // Rotation offsets, y vertically, x horizontally: r[y * 5 + x]
+    static constexpr std::array<size_t, 25> ROTATIONS = {
+        0, 1, 62, 28, 27, 36, 44, 6, 55, 20, 3, 10, 43, 25, 39, 41, 45, 15, 21, 8, 18, 2, 61, 56, 14,
+    };
+
+    static constexpr uint64_t RHO_NORMALIZATION_TABLE[3]{
+        0,
+        1,
+        0,
+    };
+
+    /**
+     * @brief Given a table input value, return the table output value
+     *
+     * Used by the Plookup code to precompute lookup tables and generate witness values
+     *
+     * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value)
+     * @return std::array<barretenberg::fr, 2> table output (normalized input and normalized input / 11^{TABLE_BITS - 1})
+     */
+    static std::array<barretenberg::fr, 2> get_rho_renormalization_values(const std::array<uint64_t, 2> key)
+    {
+        uint64_t accumulator = 0;
+        uint64_t input = key[0];
+        uint64_t base_shift = 1;
+        constexpr uint64_t divisor_exponent = TABLE_BITS - 1;
+        constexpr uint64_t divisor = numeric::pow64(BASE, divisor_exponent);
+        while (input > 0) {
+            uint64_t slice = input % BASE;
+            uint64_t bit = RHO_NORMALIZATION_TABLE[static_cast<size_t>(slice)];
+            accumulator += (bit * base_shift);
+            input /= BASE;
+            base_shift *= BASE;
+        }
+
+        return { barretenberg::fr(accumulator), barretenberg::fr(accumulator / divisor) };
+    }
+
+    /**
+     * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1])
+     * Code is slightly faster at runtime if we compute this at compile time
+     *
+     * @return constexpr std::array<uint64_t, TABLE_BITS>
+     */
+    static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases()
+    {
+        std::array<uint64_t, TABLE_BITS> result;
+        uint64_t acc = 1;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            result[i] = acc;
+            acc *= BASE;
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column values for next row of plookup table. Used to generate plookup table row values
+     *
+     * Input `counts` is an array of quasi-bits that represent the current row.
+     * Method increases `counts` by 1 and returns the plookup table column values.
+     *
+     * (a bit tricky to compute because each quasi-bit ranges from [0, 1, 2],
+     *  but we're working with base-11 numbers.
+     *  i.e. unlike most of our lookup tables, the 1st column is not uniformly increasing by a constant value!)
+     *
+     * @param counts The current row value represented as an array of quasi-bits
+     * @return std::array<uint64_t, 3> the columns of the plookup table
+     */
+    static std::array<uint64_t, 3> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts)
+    {
+        static constexpr auto scaled_bases = get_scaled_bases();
+
+        // want the most significant bit of the 64-bit integer, when this table is used on the most significant slice
+        constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS - 1);
+
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            if (counts[i] == EFFECTIVE_BASE - 1) {
+                counts[i] = 0;
+            } else {
+                counts[i] += 1;
+                break;
+            }
+        }
+
+        uint64_t value = 0;
+        uint64_t normalized_value = 0;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            value += counts[i] * scaled_bases[i];
+            normalized_value += (RHO_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i];
+        }
+        return { value, normalized_value, normalized_value / divisor };
+    }
+
+    /**
+     * @brief Generate plookup table that normalizes a TABLE_BITS-slice of a base-11 integer and extracts the msb
+     *
+     * @param id
+     * @param table_index
+     * @return BasicTable
+     */
+    static BasicTable generate_rho_renormalization_table(BasicTableId id, const size_t table_index)
+    {
+        BasicTable table;
+        table.id = id;
+        table.table_index = table_index;
+        table.use_twin_keys = false;
+        table.size = numeric::pow64(static_cast<uint64_t>(EFFECTIVE_BASE), TABLE_BITS);
+
+        std::array<size_t, TABLE_BITS> counts{};
+        std::array<uint64_t, 3> column_values{ 0, 0, 0 };
+
+        for (size_t i = 0; i < table.size; ++i) {
+            table.column_1.emplace_back(column_values[0]);
+            table.column_2.emplace_back(column_values[1]);
+            table.column_3.emplace_back(column_values[2]);
+            column_values = get_column_values_for_next_row(counts);
+        }
+
+        table.get_values_from_key = &get_rho_renormalization_values;
+
+        uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS);
+        table.column_1_step_size = barretenberg::fr(step_size);
+        table.column_2_step_size = barretenberg::fr(step_size);
+        table.column_3_step_size = barretenberg::fr(0);
+        return table;
+    }
+
+    /**
+     * @brief Create the Rho MultiTable used by plookup to generate a sequence of lookups
+     *
+     * Keccak operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits.
+     *
+     * Representing the RHO lookup as a sequence of accumulating sums is tricky because of rotations.
+     *
+     * For example, imagine our input is a 32-bit integer A represented as: A = A3.11^24 + A2.11^16 + A1.11^8 + A0,
+     *              and our output is a 32-bit integer B = B3.11^24 + B2.11^16 + B1.11^8 + B0
+     *
+     * In this example, we want to normalize A and left-rotate by 16 bits.
+     *
+     * Our lookup gate wire values will look like the following:
+     *
+     * | Row | C0                                       | C1           | C2       |
+     * | --- | -----------------------------------------| ------------ | -------- |
+     * |  0  | A3.11^24 + A2.11^16 + A1.11^8  + A0      | B1.11^8 + B0 | A0.msb() |
+     * |  1  |            A3.11^16 + A2.11^8  + A1      |           B1 | A1.msb() |
+     * |  2  |                       A3.11^8  + A2      | B3.11^8 + B2 | A2.msb() |
+     * |  3  |                                  A3      |           B3 | A3.msb() |
+     *
+     * Finally, an addition gate is used to extract B = 11^16 * C1[0] + C1[2]
+     *
+     * We use the above structure because the plookup protocol derives lookup entries by taking:
+     *
+     *      Column1 = W1[i] + Q1 . W1[i + 1]
+     *      Column2 = W2[i] + Q2 . W2[i + 1]
+     *
+     * Where Q1, Q2 are constants encoded in selector polynomials.
+     * We do not have any spare selector polynomials to apply to W1[i] and W2[i] :(
+     *
+     * i.e. we cannot represent the column C1 as a sequence of accumulating sums whilst performing a bit rotation!
+     * The part of A that wraps around past 11^64 must be represented separately vs the part that does not.
+     *
+     * @param id
+     * @return MultiTable
+     */
+    static MultiTable get_rho_output_table(const MultiTableId id = KECCAK_NORMALIZE_AND_ROTATE)
+    {
+        constexpr size_t left_bits = ROTATIONS[LANE_INDEX];
+        constexpr size_t right_bits = 64 - ROTATIONS[LANE_INDEX];
+        constexpr size_t num_left_tables =
+            left_bits / MAXIMUM_MULTITABLE_BITS + (left_bits % MAXIMUM_MULTITABLE_BITS > 0 ? 1 : 0);
+        constexpr size_t num_right_tables =
+            right_bits / MAXIMUM_MULTITABLE_BITS + (right_bits % MAXIMUM_MULTITABLE_BITS > 0 ? 1 : 0);
+
+        MultiTable table;
+        table.id = id;
+
+        table.column_1_step_sizes.push_back(1);
+        table.column_2_step_sizes.push_back(1);
+        table.column_3_step_sizes.push_back(1);
+
+        // generate table selector values for the 'right' slice
+        barretenberg::constexpr_for<0, num_right_tables, 1>([&]<size_t i> {
+            constexpr size_t num_bits_processed = (i * MAXIMUM_MULTITABLE_BITS);
+            constexpr size_t bit_slice = (num_bits_processed + MAXIMUM_MULTITABLE_BITS > right_bits)
+                                             ? right_bits % MAXIMUM_MULTITABLE_BITS
+                                             : MAXIMUM_MULTITABLE_BITS;
+
+            constexpr uint64_t scaled_base = numeric::pow64(BASE, bit_slice);
+            if (i == num_right_tables - 1) {
+                table.column_1_step_sizes.push_back(scaled_base);
+                table.column_2_step_sizes.push_back(0);
+                table.column_3_step_sizes.push_back(0);
+            } else {
+                table.column_1_step_sizes.push_back(scaled_base);
+                table.column_2_step_sizes.push_back(scaled_base);
+                table.column_3_step_sizes.push_back(0);
+            }
+
+            table.slice_sizes.push_back(scaled_base);
+            table.get_table_values.emplace_back(&get_rho_renormalization_values);
+            table.lookup_ids.push_back((BasicTableId)((size_t)KECCAK_RHO_1 + (bit_slice - 1)));
+        });
+
+        // generate table selector values for the 'left' slice
+        barretenberg::constexpr_for<0, num_left_tables, 1>([&]<size_t i> {
+            constexpr size_t num_bits_processed = (i * MAXIMUM_MULTITABLE_BITS);
+
+            constexpr size_t bit_slice = (num_bits_processed + MAXIMUM_MULTITABLE_BITS > left_bits)
+                                             ? left_bits % MAXIMUM_MULTITABLE_BITS
+                                             : MAXIMUM_MULTITABLE_BITS;
+            constexpr uint64_t scaled_base = numeric::pow64(BASE, bit_slice);
+
+            if (i != num_left_tables - 1) {
+                table.column_1_step_sizes.push_back(scaled_base);
+                table.column_2_step_sizes.push_back(scaled_base);
+                table.column_3_step_sizes.push_back(0);
+            }
+
+            table.slice_sizes.push_back(scaled_base);
+            table.get_table_values.emplace_back(&get_rho_renormalization_values);
+            table.lookup_ids.push_back((BasicTableId)((size_t)KECCAK_RHO_1 + (bit_slice - 1)));
+        });
+
+        return table;
+    }
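+
+    /*
+     * Worked example (illustrative only): for LANE_INDEX = 1 we have ROTATIONS[1] = 1, so left_bits = 1 and
+     * right_bits = 63. With MAXIMUM_MULTITABLE_BITS = 8 the right slice needs num_right_tables = 63/8 + 1 = 8
+     * lookups (seven 8-bit tables plus one 7-bit table, since 63 = 7 * 8 + 7), and the left slice needs a single
+     * 1-bit table: 9 lookups in total for this lane.
+     */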
+};
+
+} // namespace keccak_tables
+} // namespace plookup
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp
new file mode 100644
index 0000000000..4a621c4d3c
--- /dev/null
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/keccak/keccak_theta.hpp
@@ -0,0 +1,255 @@
+#pragma once
+
+#include <numeric/bitop/pow.hpp>
+#include <common/constexpr_utils.hpp>
+#include "../types.hpp"
+
+namespace plookup {
+namespace keccak_tables {
+
+/**
+ * @brief Generates plookup tables required for THETA round of Keccak hash function
+ *
+ * Keccak has 25 hash lanes, each represented as 64-bit integers. The THETA round performs the following operation on 3
+ * hash lanes:
+ *
+ * C0 = A0 ^ A1 ^ A2 ^ A3 ^ A4
+ * C1 = B0 ^ B1 ^ B2 ^ B3 ^ B4
+ * D  = C0 ^ ROTATE_LEFT(C1, 1)
+ *
+ * We evaluate in-circuit using a base-11 sparse integer representation:
+ *
+ * P = \sum_{i=0}^{63} b_i * 11^i
+ *
+ * In this representation we evaluate THETA via the linear expressions
+ *
+ * C0 = A0 + A1 + A2 + A3 + A4
+ * C1 = B0 + B1 + B2 + B3 + B4
+ * D  = C0 + ROTATE_LEFT(C1, 1)
+ *
+ * We use base-11 sparse representation because the maximum value of each 'quasi-bit' of D is 10
+ *
+ * THETA round uses a plookup table that normalizes the algebraic output.
+ *
+ * This can be represented via the 'truth table' for each base-11 quasi-bit:
+ *
+ * | Algebraic Output | Binary Output |
+ * | ---------------- | ------------- |
+ * | 0                | 0             |
+ * | 1                | 1             |
+ * | 2                | 0             |
+ * | 3                | 1             |
+ * | 4                | 0             |
+ * | 5                | 1             |
+ * | 6                | 0             |
+ * | 7                | 1             |
+ * | 8                | 0             |
+ * | 9                | 1             |
+ * | 10               | 0             |
+ *
+ * i.e. even = 0, odd = 1
+ *
+ */
+class Theta {
+  public:
+    static constexpr size_t TABLE_BITS = 4;
+    static constexpr uint64_t BASE = 11;
+
+    static constexpr uint64_t THETA_NORMALIZATION_TABLE[11]{
+        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+    };
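+
+    /*
+     * Sanity sketch (illustrative only, not used by the circuit): a THETA output quasi-bit is the sum of up to
+     * 10 binary bits, and its parity equals their XOR, which is exactly what THETA_NORMALIZATION_TABLE extracts:
+     *
+     *   for (uint64_t bits = 0; bits < (1 << 10); ++bits) {
+     *       uint64_t sum = 0, parity = 0;
+     *       for (size_t i = 0; i < 10; ++i) {
+     *           sum += (bits >> i) & 1;
+     *           parity ^= (bits >> i) & 1;
+     *       }
+     *       assert(THETA_NORMALIZATION_TABLE[sum] == parity);
+     *   }
+     */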
+
+    // template <size_t i> static std::pair<uint64_t, uint64_t> update_counts(std::array<size_t, TABLE_BITS>& counts)
+    // {
+    //     ASSERT(i <= TABLE_BITS);
+    //     if constexpr (i >= TABLE_BITS) {
+    //         // TODO use concepts or template metaprogramming to put this condition in method declaration
+    //         return std::make_pair(0, 0);
+    //     } else {
+    //         if (counts[i] == BASE - 1) {
+    //             counts[i] = 0;
+    //             return update_counts<i + 1>(counts);
+    //         } else {
+    //             counts[i] += 1;
+    //         }
+
+    //         uint64_t value = 0;
+    //         uint64_t normalized_value = 0;
+    //         uint64_t cumulative_base = 1;
+    //         for (size_t j = 0; j < TABLE_BITS; ++j) {
+    //             value += counts[j] * cumulative_base;
+    //             normalized_value += (THETA_NORMALIZATION_TABLE[counts[j]]) * cumulative_base;
+    //             cumulative_base *= BASE;
+    //         }
+    //         return std::make_pair(value, normalized_value);
+    //     }
+    // }
+
+    /**
+     * @brief Given a table input value, return the table output value
+     *
+     * Used by the Plookup code to precompute lookup tables and generate witness values
+     *
+     * @param key (first element = table input. Second element is unused as this lookup does not have 2 keys per value)
+     * @return std::array<barretenberg::fr, 2> table output (normalized input; the second element is unused and set to 0)
+     */
+    static std::array<barretenberg::fr, 2> get_theta_renormalization_values(const std::array<uint64_t, 2> key)
+    {
+        uint64_t accumulator = 0;
+        uint64_t input = key[0];
+        uint64_t base_shift = 1;
+        while (input > 0) {
+            uint64_t slice = input % BASE;
+            uint64_t bit = THETA_NORMALIZATION_TABLE[static_cast<size_t>(slice)];
+            accumulator += (bit * base_shift);
+            input /= BASE;
+            base_shift *= BASE;
+        }
+        return { barretenberg::fr(accumulator), barretenberg::fr(0) };
+    }
+
+    /**
+     * @brief Precompute an array of base multipliers (11^i for i = [0, ..., TABLE_BITS - 1])
+     * Code is slightly faster at runtime if we compute this at compile time
+     *
+     * @return constexpr std::array<uint64_t, TABLE_BITS>
+     */
+    static constexpr std::array<uint64_t, TABLE_BITS> get_scaled_bases()
+    {
+        std::array<uint64_t, TABLE_BITS> result;
+        uint64_t acc = 1;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            result[i] = acc;
+            acc *= BASE;
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get column values for next row of plookup table. Used to generate plookup table row values
+     *
+     * Input `counts` is an array of quasi-bits that represent the current row.
+     * Method increases `counts` by 1 and returns the plookup table column values.
+     *
+     * @param counts The current row value represented as an array of quasi-bits
+     * @return std::array<uint64_t, 2> the columns of the plookup table
+     */
+    static std::array<uint64_t, 2> get_column_values_for_next_row(std::array<size_t, TABLE_BITS>& counts)
+    {
+        static constexpr auto scaled_bases = get_scaled_bases();
+
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            if (counts[i] == BASE - 1) {
+                counts[i] = 0;
+            } else {
+                counts[i] += 1;
+                break;
+            }
+        }
+
+        uint64_t value = 0;
+        uint64_t normalized_value = 0;
+        for (size_t i = 0; i < TABLE_BITS; ++i) {
+            value += counts[i] * scaled_bases[i];
+            normalized_value += (THETA_NORMALIZATION_TABLE[counts[i]]) * scaled_bases[i];
+        }
+        return { value, normalized_value };
+    }
+
+    /**
+     * @brief Generate plookup table that normalizes a TABLE_BITS-slice of a base-11 integer
+     *
+     * @param id
+     * @param table_index
+     * @return BasicTable
+     */
+    static BasicTable generate_theta_renormalization_table(BasicTableId id, const size_t table_index)
+    {
+        // max_base_value_plus_one may not equal BASE when this is an intermediate lookup table
+        // (e.g. keccak, we have base11 values that need to be normalized where the actual values-per-base only range
+        // from [0, 1, 2])
+        BasicTable table;
+        table.id = id;
+        table.table_index = table_index;
+        table.use_twin_keys = false;
+        table.size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS);
+
+        std::array<size_t, TABLE_BITS> counts{};
+        std::array<uint64_t, 2> column_values{ 0, 0 };
+
+        for (size_t i = 0; i < table.size; ++i) {
+            table.column_1.emplace_back(column_values[0]);
+            table.column_2.emplace_back(column_values[1]);
+            table.column_3.emplace_back(0);
+            column_values = get_column_values_for_next_row(counts);
+        }
+
+        table.get_values_from_key = &get_theta_renormalization_values;
+
+        constexpr uint64_t step_size = numeric::pow64(static_cast<uint64_t>(BASE), TABLE_BITS);
+        table.column_1_step_size = barretenberg::fr(step_size);
+        table.column_2_step_size = barretenberg::fr(step_size);
+        table.column_3_step_size = barretenberg::fr(0);
+        return table;
+    }
+
+    /**
+     * @brief Create the THETA MultiTable used by plookup to generate a sequence of lookups
+     *
+     * The THETA round operates on 64-bit integers, but the lookup table only indexes TABLE_BITS bits.
+     *
+     * i.e. multiple lookups are required for a single 64-bit integer.
+     *
+     * If we group these lookups together, we can derive the plookup column values
+     * from the relative difference between wire values.
+     *
+     * i.e. we do not need to split our 64-bit input into TABLE_BITS slices, perform the lookup and add together the
+     * output slices
+     *
+     * Instead, e.g. for TABLE_BITS = 8 we have inputs A, B where
+     *      A = \sum_{i=0}^7 A_i * 11^{8i}
+     *      B = \sum_{i=0}^7 B_i * 11^{8i}
+     *      (the third wire column is unused by this table)
+     *
+     * Our plookup gates will produce a gates with the following wire values:
+     *
+     * | W1                      | W2                      | W3  |
+     * | ----------------------- | ----------------------- | --- |
+     * | \sum_{i=0}^7 A_i * 11^i | \sum_{i=0}^7 B_i * 11^i | --- |
+     * | \sum_{i=1}^7 A_i * 11^i | \sum_{i=1}^7 B_i * 11^i | --- |
+     * | \sum_{i=2}^7 A_i * 11^i | \sum_{i=2}^7 B_i * 11^i | --- |
+     * | ...                     | ...                     | ... |
+     * | A^7                     | B^7                     | --- |
+     *
+     * The plookup protocol extracts the 1st and 2nd lookup column values by taking:
+     *
+     *      Colunn1 = W1[i] - 11^8 . W1[i + 1]
+     *      Colunn2 = W2[i] - 11^8 . W2[i + 1]
+     *
+     * (where the -11^8 coefficient is stored in a precomputed selector polynomial)
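+     *
+     * Small hand-worked illustration (two slices only; the values are not taken from the code): if
+     * A = A_1 * 11^8 + A_0, then the W1 column holds { A_1 * 11^8 + A_0, A_1 } and the protocol recovers the first
+     * lookup key as W1[0] - 11^8 * W1[1] = A_0.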
+     *
+     * This MultiTable construction defines the value of these precomputed selector polynomial values,
+     * as well as defines how the column values are derived from a starting input value.
+     *
+     * @param id
+     * @return MultiTable
+     */
+    static MultiTable get_theta_output_table(const MultiTableId id = KECCAK_THETA_OUTPUT)
+    {
+        constexpr size_t num_tables_per_multitable =
+            (64 / TABLE_BITS) + (64 % TABLE_BITS == 0 ? 0 : 1); // 64 quasi-bits, TABLE_BITS bits per lookup
+
+        uint64_t column_multiplier = numeric::pow64(BASE, TABLE_BITS);
+        MultiTable table(column_multiplier, column_multiplier, 0, num_tables_per_multitable);
+
+        table.id = id;
+        for (size_t i = 0; i < num_tables_per_multitable; ++i) {
+            table.slice_sizes.emplace_back(numeric::pow64(BASE, TABLE_BITS));
+            table.lookup_ids.emplace_back(KECCAK_THETA);
+            table.get_table_values.emplace_back(&get_theta_renormalization_values);
+        }
+        return table;
+    }
+};
+} // namespace keccak_tables
+} // namespace plookup
\ No newline at end of file
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp
index 26cae5826a..299bc8966f 100644
--- a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.cpp
@@ -1,4 +1,5 @@
 #include "plookup_tables.hpp"
+#include <common/constexpr_utils.hpp>
 
 namespace plookup {
 
@@ -82,6 +83,19 @@ void init_multi_tables()
         blake2s_tables::get_blake2s_xor_rotate_8_table(MultiTableId::BLAKE_XOR_ROTATE_8);
     MULTI_TABLES[MultiTableId::BLAKE_XOR_ROTATE_7] =
         blake2s_tables::get_blake2s_xor_rotate_7_table(MultiTableId::BLAKE_XOR_ROTATE_7);
+    MULTI_TABLES[MultiTableId::KECCAK_FORMAT_INPUT] =
+        keccak_tables::KeccakInput::get_keccak_input_table(MultiTableId::KECCAK_FORMAT_INPUT);
+    MULTI_TABLES[MultiTableId::KECCAK_THETA_OUTPUT] =
+        keccak_tables::Theta::get_theta_output_table(MultiTableId::KECCAK_THETA_OUTPUT);
+    MULTI_TABLES[MultiTableId::KECCAK_CHI_OUTPUT] =
+        keccak_tables::Chi::get_chi_output_table(MultiTableId::KECCAK_CHI_OUTPUT);
+    MULTI_TABLES[MultiTableId::KECCAK_FORMAT_OUTPUT] =
+        keccak_tables::KeccakOutput::get_keccak_output_table(MultiTableId::KECCAK_FORMAT_OUTPUT);
+
+    barretenberg::constexpr_for<0, 25, 1>([&]<size_t i>() {
+        MULTI_TABLES[(size_t)MultiTableId::KECCAK_NORMALIZE_AND_ROTATE + i] =
+            keccak_tables::Rho<8, i>::get_rho_output_table(MultiTableId::KECCAK_NORMALIZE_AND_ROTATE);
+    });
 }
 } // namespace
 
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp
index 518e025982..e1e30bcb44 100644
--- a/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/plookup_tables.hpp
@@ -9,6 +9,12 @@
 #include "uint.hpp"
 #include "non_native_group_generator.hpp"
 #include "blake2s.hpp"
+#include "keccak/keccak_chi.hpp"
+#include "keccak/keccak_input.hpp"
+#include "keccak/keccak_output.hpp"
+#include "keccak/keccak_rho.hpp"
+#include "keccak/keccak_theta.hpp"
+
 namespace plookup {
 
 const MultiTable& create_table(const MultiTableId id);
@@ -236,6 +242,42 @@ inline BasicTable create_basic_table(const BasicTableId id, const size_t index)
     case PEDERSEN_IV_BASE: {
         return pedersen_tables::basic::generate_pedersen_iv_table(PEDERSEN_IV_BASE);
     }
+    case KECCAK_INPUT: {
+        return keccak_tables::KeccakInput::generate_keccak_input_table(KECCAK_INPUT, index);
+    }
+    case KECCAK_THETA: {
+        return keccak_tables::Theta::generate_theta_renormalization_table(KECCAK_THETA, index);
+    }
+    case KECCAK_CHI: {
+        return keccak_tables::Chi::generate_chi_renormalization_table(KECCAK_CHI, index);
+    }
+    case KECCAK_OUTPUT: {
+        return keccak_tables::KeccakOutput::generate_keccak_output_table(KECCAK_OUTPUT, index);
+    }
+    case KECCAK_RHO_1: {
+        return keccak_tables::Rho<1>::generate_rho_renormalization_table(KECCAK_RHO_1, index);
+    }
+    case KECCAK_RHO_2: {
+        return keccak_tables::Rho<2>::generate_rho_renormalization_table(KECCAK_RHO_2, index);
+    }
+    case KECCAK_RHO_3: {
+        return keccak_tables::Rho<3>::generate_rho_renormalization_table(KECCAK_RHO_3, index);
+    }
+    case KECCAK_RHO_4: {
+        return keccak_tables::Rho<4>::generate_rho_renormalization_table(KECCAK_RHO_4, index);
+    }
+    case KECCAK_RHO_5: {
+        return keccak_tables::Rho<5>::generate_rho_renormalization_table(KECCAK_RHO_5, index);
+    }
+    case KECCAK_RHO_6: {
+        return keccak_tables::Rho<6>::generate_rho_renormalization_table(KECCAK_RHO_6, index);
+    }
+    case KECCAK_RHO_7: {
+        return keccak_tables::Rho<7>::generate_rho_renormalization_table(KECCAK_RHO_7, index);
+    }
+    case KECCAK_RHO_8: {
+        return keccak_tables::Rho<8>::generate_rho_renormalization_table(KECCAK_RHO_8, index);
+    }
     default: {
         throw_or_abort("table id does not exist");
         return sparse_tables::generate_sparse_table_with_rotation<9, 8, 0>(AES_SPARSE_MAP, index);
diff --git a/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp b/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp
index c440459848..142b948d6b 100644
--- a/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp
+++ b/cpp/src/aztec/plonk/composer/plookup_tables/types.hpp
@@ -83,6 +83,20 @@ enum BasicTableId {
     PEDERSEN_1,
     PEDERSEN_0,
     PEDERSEN_IV_BASE,
+    KECCAK_INPUT,
+    KECCAK_THETA,
+    KECCAK_RHO,
+    KECCAK_CHI,
+    KECCAK_OUTPUT,
+    KECCAK_RHO_1,
+    KECCAK_RHO_2,
+    KECCAK_RHO_3,
+    KECCAK_RHO_4,
+    KECCAK_RHO_5,
+    KECCAK_RHO_6,
+    KECCAK_RHO_7,
+    KECCAK_RHO_8,
+    KECCAK_RHO_9,
 };
 
 enum MultiTableId {
@@ -122,7 +136,12 @@ enum MultiTableId {
     BLAKE_XOR_ROTATE_8,
     BLAKE_XOR_ROTATE_7,
     PEDERSEN_IV,
-    NUM_MULTI_TABLES,
+    KECCAK_THETA_OUTPUT,
+    KECCAK_CHI_OUTPUT,
+    KECCAK_FORMAT_INPUT,
+    KECCAK_FORMAT_OUTPUT,
+    KECCAK_NORMALIZE_AND_ROTATE,
+    NUM_MULTI_TABLES = KECCAK_NORMALIZE_AND_ROTATE + 25,
 };
 
 struct MultiTable {
diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.cpp b/cpp/src/aztec/plonk/composer/ultra_composer.cpp
index 6de7357e35..5cb997b1ef 100644
--- a/cpp/src/aztec/plonk/composer/ultra_composer.cpp
+++ b/cpp/src/aztec/plonk/composer/ultra_composer.cpp
@@ -1102,6 +1102,20 @@ std::vector<uint32_t> UltraComposer::decompose_into_default_range(const uint32_t
     }
 
     const uint64_t sublimb_mask = (1ULL << target_range_bitnum) - 1;
+
+    /**
+     * TODO: Support this commented-out code!
+     * At the moment, `decompose_into_default_range` generates a minimum of 1 arithmetic gate.
+     * This is not strictly required if num_bits <= target_range_bitnum.
+     * However, this produces an edge-case where a variable is range-constrained but NOT present in an arithmetic gate.
+     * This in turn produces an unsatisfiable circuit (see `create_new_range_constraint`). We would need to check for
+     * and accommodate/reject this edge case in order to skip the addition gates here when they are not required.
+     * if (num_bits <= target_range_bitnum) {
+     *     const uint64_t expected_range = (1ULL << num_bits) - 1ULL;
+     *     create_new_range_constraint(variable_index, expected_range);
+     *     return { variable_index };
+     * }
+     **/
     std::vector<uint64_t> sublimbs;
     std::vector<uint32_t> sublimb_indices;
 
diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.hpp b/cpp/src/aztec/plonk/composer/ultra_composer.hpp
index 74ce23509c..c5880d04db 100644
--- a/cpp/src/aztec/plonk/composer/ultra_composer.hpp
+++ b/cpp/src/aztec/plonk/composer/ultra_composer.hpp
@@ -150,6 +150,15 @@ class UltraComposer : public ComposerBase {
     void create_range_constraint(const uint32_t variable_index, const size_t num_bits, std::string const&)
     {
         if (num_bits <= DEFAULT_PLOOKUP_RANGE_BITNUM) {
+            /**
+             * N.B. if `variable_index` is not used in any arithmetic constraints, this will create an unsatisfiable
+             *      circuit!
+             *      This range constraint will increase the size of the 'sorted set' of range-constrained integers by 1.
+             *      The 'non-sorted set' of range-constrained integers is a subset of the wire indices of all arithmetic
+             *      gates. No arithmetic gate => size imbalance between the sorted and non-sorted sets. Checking for this
+             *      and throwing an error would require a refactor of the Composer to catalog all 'orphan' variables not
+             *      assigned to gates.
+             **/
             create_new_range_constraint(variable_index, 1ULL << num_bits);
         } else {
             decompose_into_default_range(variable_index, num_bits);
@@ -178,6 +187,10 @@ class UltraComposer : public ComposerBase {
      */
     virtual size_t get_num_gates() const override
     {
+        // if the circuit has already been finalised, the extra gates have already been added to `num_gates`
+        if (circuit_finalised) {
+            return num_gates;
+        }
         size_t count = num_gates;
         size_t rangecount = 0;
         size_t romcount = 0;
@@ -208,7 +221,13 @@ class UltraComposer : public ComposerBase {
     {
         size_t count = num_gates;
         size_t rangecount = 0;
+        size_t constant_rangecount = 0;
         size_t romcount = 0;
+        size_t plookupcount = 0;
+        for (auto& table : lookup_tables) {
+            plookupcount += table.lookup_gates.size();
+            count -= table.lookup_gates.size();
+        }
         for (size_t i = 0; i < rom_arrays.size(); ++i) {
             for (size_t j = 0; j < rom_arrays[i].state.size(); ++j) {
                 if (rom_arrays[i].state[j][0] == UNINITIALIZED_MEMORY_RECORD) {
@@ -228,9 +247,15 @@ class UltraComposer : public ComposerBase {
             list_size += padding;
             rangecount += (list_size / gate_width);
+            rangecount += 1; // we need to add 1 extra addition gate for every distinct range list
+
+            // rough estimate
+            const size_t constant_cost = static_cast<size_t>(list.second.target_range / 6);
+            constant_rangecount += constant_cost;
+            rangecount -= constant_cost;
         }
-        size_t total = count + romcount + rangecount;
-        std::cout << "gates = " << total << " (arith " << count << ", rom " << romcount << ", range " << rangecount
+        size_t total = count + romcount + rangecount + constant_rangecount + plookupcount;
+        std::cout << "gates = " << total << " (arith " << count << ", plookup " << plookupcount << ", rom " << romcount
+                  << ", range " << rangecount << ", range table init cost = " << constant_rangecount
                   << "), pubinp = " << public_inputs.size() << std::endl;
     }
 
diff --git a/cpp/src/aztec/stdlib/hash/CMakeLists.txt b/cpp/src/aztec/stdlib/hash/CMakeLists.txt
index d8b6078d10..c8aaf1e4b4 100644
--- a/cpp/src/aztec/stdlib/hash/CMakeLists.txt
+++ b/cpp/src/aztec/stdlib/hash/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(blake2s)
 add_subdirectory(blake3s)
 add_subdirectory(pedersen)
-add_subdirectory(sha256)
\ No newline at end of file
+add_subdirectory(sha256)
+add_subdirectory(keccak)
\ No newline at end of file
diff --git a/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt b/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt
new file mode 100644
index 0000000000..d15f3dc401
--- /dev/null
+++ b/cpp/src/aztec/stdlib/hash/keccak/CMakeLists.txt
@@ -0,0 +1 @@
+barretenberg_module(stdlib_keccak stdlib_primitives crypto_keccak)
\ No newline at end of file
diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp
new file mode 100644
index 0000000000..2a138a5a11
--- /dev/null
+++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.cpp
@@ -0,0 +1,620 @@
+#include "keccak.hpp"
+#include <plonk/composer/ultra_composer.hpp>
+#include <stdlib/primitives/uint/uint.hpp>
+#include <common/constexpr_utils.hpp>
+#include <numeric/bitop/sparse_form.hpp>
+namespace plonk {
+namespace stdlib {
+
+/**
+ * @brief Normalize a base-11 limb and left-rotate by keccak::ROTATIONS[lane_index] bits.
+ *        This method also extracts the most significant bit of the normalised rotated limb.
+ *        Used in the RHO and IOTA rounds and in `sponge_absorb`.
+ *
+ * Normalize process:
+ *  Input v = \sum_{i=0}^63 b_i * 11^i , where b is in range [0, 1, 2]
+ *  Output  = \sum_{i=0}^63 (b_i & 1) * 11^i (i.e. even values go to 0)
+ *
+ * Implementation is via a sequence of lookup tables
+ *
+ * @tparam lane_index What keccak lane are we working on?
+ * @param limb Input limb we want to normalize and rotate
+ * @param msb (return parameter) The most significant bit of the normalized and rotated limb
+ * @return field_t<Composer> The normalized and rotated output
+ */
+template <typename Composer>
+template <size_t lane_index>
+field_t<Composer> keccak<Composer>::normalize_and_rotate(const field_ct& limb, field_ct& msb)
+{
+    // left_bits = the number of bits that wrap around 11^64
+    constexpr size_t left_bits = ROTATIONS[lane_index];
+
+    // right_bits = the number of bits that don't wrap
+    constexpr size_t right_bits = 64 - ROTATIONS[lane_index];
+
+    // TODO read from same source as plookup table code
+    constexpr size_t max_bits_per_table = plookup::keccak_tables::Rho<>::MAXIMUM_MULTITABLE_BITS;
+
+    // compute the number of lookups required for our left and right bit slices
+    constexpr size_t num_left_tables = left_bits / max_bits_per_table + (left_bits % max_bits_per_table > 0 ? 1 : 0);
+    constexpr size_t num_right_tables = right_bits / max_bits_per_table + (right_bits % max_bits_per_table > 0 ? 1 : 0);
+
+    // get the numerical value of the left and right bit slices
+    // (lookup table input values derived from left / right)
+    uint256_t input = limb.get_value();
+    constexpr uint256_t slice_divisor = BASE.pow(right_bits);
+    const auto [left, right] = input.divmod(slice_divisor);
+
+    // compute the normalized values for the left and right bit slices
+    // (lookup table output values derived from left_normalised / right_normalized)
+    uint256_t left_normalized = normalize_sparse(left);
+    uint256_t right_normalized = normalize_sparse(right);
+
+    /**
+     * manually construct the ReadData object required to generate plookup gate constraints.
+     * To explain in more detail: the input integer can be represented via the two bit slices [A, B]
+     * (A = left, B = right)
+     *
+     * For example, imagine our input is a 32-bit integer A represented as: A = A3.11^24 + A2.11^16 + A1.11^8 + A0,
+     *              and our output is a 32-bit integer B = B3.11^24 + B2.11^16 + B1.11^8 + B0
+     *
+     * In this example, we want to normalize A and left-rotate by 16 bits.
+     *
+     * Our lookup gate wire values will look like the following:
+     *
+     * | Row | C0                                       | C1           | C2       |
+     * | --- | -----------------------------------------| ------------ | -------- |
+     * |  0  | A3.11^24 + A2.11^16 + A1.11^8  + A0      | B1.11^8 + B0 | A0.msb() |
+     * |  1  |            A3.11^16 + A2.11^8  + A1      |           B1 | A1.msb() |
+     * |  2  |                       A3.11^8  + A2      | B3.11^8 + B2 | A2.msb() |
+     * |  3  |                                  A3      |           B3 | A3.msb() |
+     *
+     * The plookup table keys + values are derived via the expression:
+     *
+     * C1[i] + C1[i+1].q1[i] = LOOKUP[C0[i] + C0[i+1].q0[i]]
+     *
+     * (the same applies for C2, however q2[i] = 0 for all rows)
+     *
+     * The plookup coefficients for the rows treat Column0 as a single accumulating sum,
+     * but Column1 is a pair of accumulating sums.
+     * In the above example, the q coefficient values are:
+     *
+     * | Row | Q1   | Q2   | Q3 |
+     * | --- | ---- | ---- | -- |
+     * |  0  | 11^8 | 11^8 | 0  |
+     * |  1  | 11^8 | 0    | 0  |
+     * |  2  | 11^8 | 11^8 | 0  |
+     * |  3  | 0    | 0    | 0  |
+     *
+     * stdlib::plookup cannot derive witnesses in the above pattern without a substantial rewrite,
+     * so we do it manually in this method!
+     **/
+    plookup::ReadData<barretenberg::fr> lookup;
+
+    // compute plookup witness values for a given slice
+    // (same lambda can be used to compute witnesses for left and right slices)
+    auto compute_lookup_witnesses_for_limb = [&]<size_t limb_bits, size_t num_lookups>(uint256_t& normalized) {
+        // (use a constexpr loop to make some pow and div operations compile-time)
+        barretenberg::constexpr_for<0, num_lookups, 1>([&]<size_t i> {
+            constexpr size_t num_bits_processed = i * max_bits_per_table;
+
+            // How many bits can this slice contain?
+            // We want to implicitly range-constrain `normalized < 11^{limb_bits}`,
+            // which means potentially using a lookup table that is not of size 11^{max_bits_per_table}
+            // for the most-significant slice
+            constexpr size_t bit_slice = (num_bits_processed + max_bits_per_table > limb_bits)
+                                             ? limb_bits % max_bits_per_table
+                                             : max_bits_per_table;
+
+            // current column values are tracked via 'input' and 'normalized'
+            lookup[ColumnIdx::C1].push_back(input);
+            lookup[ColumnIdx::C2].push_back(normalized);
+
+            constexpr uint64_t divisor = numeric::pow64(static_cast<uint64_t>(BASE), bit_slice);
+            constexpr uint64_t msb_divisor = divisor / static_cast<uint64_t>(BASE);
+
+            // compute the value of the most significant bit of this slice and store in C3
+            const auto [normalized_quotient, normalized_slice] = normalized.divmod(divisor);
+
+            // 256-bit divisions are expensive! cast to u64s when we don't need the extra bits
+            const uint64_t normalized_msb = (static_cast<uint64_t>(normalized_slice) / msb_divisor);
+            lookup[ColumnIdx::C3].push_back(normalized_msb);
+
+            // We need to provide a key/value object for this lookup in order for the Composer
+            // to compute the plookup sorted list commitment
+            const auto [input_quotient, input_slice] = input.divmod(divisor);
+            lookup.key_entries.push_back(
+                { { static_cast<uint64_t>(input_slice), 0 }, { normalized_slice, normalized_msb } });
+
+            // reduce the input and output by 11^{bit_slice}
+            input = input_quotient;
+            normalized = normalized_quotient;
+        });
+    };
+
+    // template lambda syntax is a little funky.
+    // Need to explicitly write `.template operator()` (instead of just `()`).
+    // Otherwise compiler cannot distinguish between `>` symbol referring to closing the template parameter list,
+    // OR `>` being a greater-than operator :/
+    compute_lookup_witnesses_for_limb.template operator()<right_bits, num_right_tables>(right_normalized);
+    compute_lookup_witnesses_for_limb.template operator()<left_bits, num_left_tables>(left_normalized);
+
+    // Call composer method to create plookup constraints.
+    // The MultiTable table index can be derived from `lane_index`
+    // Each lane_index has a different rotation amount, which changes the sizes of the left/right slices
+    // and therefore the selector constants required (i.e. the Q1, Q2, Q3 values in the earlier example)
+    const auto accumulator_witnesses = limb.context->create_gates_from_plookup_accumulators(
+        (plookup::MultiTableId)((size_t)KECCAK_NORMALIZE_AND_ROTATE + lane_index),
+        lookup,
+        limb.normalize().get_witness_index());
+
+    // extract the most significant bit of the normalized output from the final lookup entry in column C3
+    msb = field_ct::from_witness_index(limb.get_context(),
+                                       accumulator_witnesses[ColumnIdx::C3][num_left_tables + num_right_tables - 1]);
+
+    // Extract the witness that maps to the normalized right slice
+    const field_t<Composer> right_output =
+        field_t<Composer>::from_witness_index(limb.get_context(), accumulator_witnesses[ColumnIdx::C2][0]);
+
+    if (num_left_tables == 0) {
+        // if the left slice size is 0 bits (i.e. no rotation), return `right_output`
+        return right_output;
+    } else {
+        // Extract the normalized left slice
+        const field_t<Composer> left_output = field_t<Composer>::from_witness_index(
+            limb.get_context(), accumulator_witnesses[ColumnIdx::C2][num_right_tables]);
+
+        // Stitch the right/left slices together to create our rotated output
+        constexpr uint256_t shift = BASE.pow(ROTATIONS[lane_index]);
+        return (left_output + right_output * shift);
+    }
+}
+
+/**
+ * @brief Compute twisted representation of hash lane
+ *
+ * The THETA round requires computation of XOR(A, ROTL(B, 1))
+ *
+ * We do this via a 'twisted' base-11 representation.
+ *
+ * If the bit slices for a regular variable are arranged [b63, ..., b0],
+ * the twisted representation is a 65-bit variable [b63, ..., b0, b63]
+ *
+ * The equivalent of XOR(A, ROTL(B, 1)) is A.twist + 11 * B.twist (in base-11 form)
+ * The output is present in bit slices 1-64
+ *
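+ * As a toy illustration (4-bit lanes instead of 64, chosen by hand): if A = [a3 a2 a1 a0] and B = [b3 b2 b1 b0],
+ * then twist(A) = [a3 a2 a1 a0 a3] and twist(B) * 11 = [b3 b2 b1 b0 b3 0]. Their sum holds
+ * [a3+b2, a2+b1, a1+b0, a0+b3] in slices 4..1, i.e. A XOR ROTL(B, 1) once each slice is normalized.
+ *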
+ * @tparam Composer
+ * @param internal
+ */
+template <typename Composer> void keccak<Composer>::compute_twisted_state(keccak_state& internal)
+{
+    for (size_t i = 0; i < 25; ++i) {
+        internal.twisted_state[i] = ((internal.state[i] * 11) + internal.state_msb[i]).normalize();
+    }
+}
+
+/**
+ * @brief THETA round
+ *
+ * @tparam Composer
+ *
+ * THETA consists of XOR operations as well as left rotations by 1 bit.
+ *
+ * We represent 64-bit integers in a base-11 representation where
+ *  limb = \sum_{i=0}^63 b_i * 11^i
+ *
+ * At the start of THETA, all b_i values are either 0 or 1
+ *
+ * We can efficiently evaluate XOR operations via simple additions!
+ * If b_i = even, this represents a bit value of 0
+ * If b_i = odd, this represents a bit value of 1
+ *
+ * The KECCAK_THETA_OUTPUT lookup table is used to 'normalize' base-11 integers,
+ * i.e. convert b_i values from [0, ..., 10] to [0, 1] where even == 0, odd == 1
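+ * (hand-worked example: the XOR of column bits 1, 0, 1, 1, 0 is 1; their base-11 sum is 3, which is odd and so
+ * normalizes to 1)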
+ *
+ * The choice of base for our representation affects the following:
+ * 1. the number of normalization lookups required to avoid overflowing the base
+ * 2. the cost of normalization lookups
+ *
+ * Bigger base reduces (1) but increases (2). For THETA, base-11 is optimal (I think...)
+ *
+ * ### HANDLING ROTATIONS
+ *
+ * We need to left-rotate the C[5] array by 1-bit to compute D[5]. Naive way is expensive so we cheat!
+ * When converting integers into base-11 representation, we use a lookup table column to give us the
+ * most significant bit of the integer.
+ *
+ * This enables us to create a 'twisted' representation of the integer in base-11:
+ *
+ * twisted_limb = (b_63) + \sum_{i=0}^63 b_i * 11^{i + 1}
+ *
+ * e.g. if limb's bit ordering is [0,   b63, ..., b1, b0 ]
+ * twisted limb bit ordering is   [b63, b62, ..., b0, b63]
+ *
+ * We want to be able to compute XOR(A, B.rotate_left(1)) and can do this via twisted representations
+ *
+ * The equivalent in the base-11 world is twisted_A + twisted_B * 11.
+ * The output of the XOR operation exists in bit-slices 1, ..., 64
+ * (which can be extracted by removing the least and most significant slices of the output)
+ * This is MUCH cheaper than the extra range constraints required for a naive left-rotation
+ *
+ * Total cost of theta = 20.5 gates per 5 lanes + 25 = 127.5 per round
+ */
+template <typename Composer> void keccak<Composer>::theta(keccak_state& internal)
+{
+    std::array<field_ct, 5> C;
+    std::array<field_ct, 5> D;
+
+    auto& state = internal.state;
+    const auto& twisted_state = internal.twisted_state;
+    for (size_t i = 0; i < 5; ++i) {
+
+        /**
+         * field_ct::accumulate can compute 5 addition operations in only 2 gates:
+         * Gate 0 wires [a0, a1, a2, a3]
+         * Gate 1 wires [b0, b1, b2, b3]
+         * b3 = a0 + a1 + a2 + a3
+         * b2 = b3 + b0 + b1
+         * (b2 is the output wire)
+         **/
+        C[i] = field_ct::accumulate({ twisted_state[i],
+                                      twisted_state[5 + i],
+                                      twisted_state[10 + i],
+                                      twisted_state[15 + i],
+                                      twisted_state[20 + i] });
+    }
+
+    /**
+     * Compute D by exploiting twisted representation
+     * to get a cheap left-rotation by 1 bit
+     */
+    for (size_t i = 0; i < 5; ++i) {
+        const auto non_shifted_equivalent = (C[(i + 4) % 5]);
+        const auto shifted_equivalent = C[(i + 1) % 5] * BASE;
+        D[i] = (non_shifted_equivalent + shifted_equivalent);
+    }
+
+    /**
+     * D contains 66 base-11 slices.
+     *
+     * We need to remove the most significant slice and the least significant slice,
+     * as they are artifacts of our twist operation.
+     *
+     * We also need to 'normalize' D (i.e. convert each base value to be 0 or 1),
+     * to prevent our base from overflowing when we XOR D into internal.state
+     *
+     * 1. create sliced_D witness, plus lo and hi slices
+     * 2. validate D == lo + (sliced_D * 11) + (hi * 11^65)
+     * 3. feed sliced_D into KECCAK_THETA_OUTPUT lookup table
+     *
+     * KECCAK_THETA_OUTPUT currently splits its input into 16 4-bit slices (in base 11, i.e. each from 0 to 11^4 - 1)
+     * This ensures that sliced_D is correctly range constrained to be < 11^64
+     */
+    static constexpr uint256_t divisor = BASE.pow(64);
+    static constexpr uint256_t multiplicand = BASE.pow(65);
+    for (size_t i = 0; i < 5; ++i) {
+        uint256_t D_native = D[i].get_value();
+        const auto [D_quotient, lo_native] = D_native.divmod(BASE);
+        const uint256_t hi_native = D_quotient / divisor;
+        const uint256_t mid_native = D_quotient - hi_native * divisor;
+
+        field_ct hi(witness_ct(internal.context, hi_native));
+        field_ct mid(witness_ct(internal.context, mid_native));
+        field_ct lo(witness_ct(internal.context, lo_native));
+
+        // assert equal should cost 1 gate (multipliers are all constants)
+        D[i].assert_equal((hi * multiplicand).add_two(mid * 11, lo));
+        internal.context->create_new_range_constraint(hi.get_witness_index(), static_cast<uint64_t>(BASE));
+        internal.context->create_new_range_constraint(lo.get_witness_index(), static_cast<uint64_t>(BASE));
+
+        // If number of bits in KECCAK_THETA_OUTPUT table does NOT cleanly divide 64,
+        // we need an additional range constraint to ensure that mid < 11^64
+        if constexpr (64 % plookup::keccak_tables::Theta::TABLE_BITS == 0) {
+            // N.B. we could optimize out 5 gates per round here but it's very fiddly...
+            // In previous section, D[i] = X + Y (non shifted equiv and shifted equiv)
+            // We also want to validate D[i] == hi' + mid' + lo (where hi', mid' are hi, mid scaled by constants)
+            // We *could* create a big addition gate to validate the previous logic w. following structure:
+            // | w1 | w2  | w3 | w4 |
+            // | -- | --- | -- | -- |
+            // | hi | mid | lo | X  |
+            // | P0 | P1  | P2 | Y  |
+            // To save a gate, we would need to place the wires for the first KECCAK_THETA_OUTPUT plookup gate
+            // at P0, P1, P2. This is fiddly composer logic that is circuit-width-dependent
+            // (this would save 120 gates per hash block... not worth making the code less readable for that)
+            D[i] = plookup_read::read_from_1_to_2_table(KECCAK_THETA_OUTPUT, mid);
+        } else {
+            const auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_THETA_OUTPUT, D[i]);
+            D[i] = accumulators[ColumnIdx::C2][0];
+
+            // Ensure input to lookup is < 11^64,
+            // by validating most significant input slice is < 11^{64 mod slice_bits}
+            const field_ct most_significant_slice = accumulators[ColumnIdx::C1][accumulators[ColumnIdx::C1].size() - 1];
+
+            // N.B. cheaper to validate (11^{64 mod slice_bits} - slice < 2^14) as this
+            // prevents an extra range table from being created
+            constexpr uint256_t maximum = BASE.pow(64 % plookup::keccak_tables::Theta::TABLE_BITS);
+            const field_ct target = -most_significant_slice + maximum;
+            ASSERT(((uint256_t(1) << Composer::DEFAULT_PLOOKUP_RANGE_BITNUM) - 1) > maximum);
+            target.create_range_constraint(Composer::DEFAULT_PLOOKUP_RANGE_BITNUM,
+                                           "input to KECCAK_THETA_OUTPUT too large!");
+        }
+    }
+
+    // compute state[j * 5 + i] XOR D[i] in base-11 representation
+    for (size_t i = 0; i < 5; ++i) {
+        for (size_t j = 0; j < 5; ++j) {
+            state[j * 5 + i] = state[j * 5 + i] + D[i];
+        }
+    }
+}
+
+/**
+ * @brief RHO round
+ *
+ * @tparam Composer
+ *
+ * The limbs of internal.state are represented via base-11 integers
+ *  limb = \sum_{i=0}^63 b_i * 11^i
+ * The value of each b_i can be in the range [0, 1, 2] due to the THETA round XOR operations
+ *
+ * We need to do the following:
+ *
+ * 1. 'normalize' each limb so that each b_i value is 0 or 1
+ * 2. left-rotate each limb as defined by the keccak `rotations` matrix
+ *
+ * The KECCAK_NORMALIZE_AND_ROTATE lookup tables are used for both. See `normalize_and_rotate` for more details
+ *
+ * COST PER LIMB...
+ *     8 gates for first lane (no rotation. Lookup table is 8-bits per slice = 8 lookups for 64 bits)
+ *     10 gates for other 24 lanes (lookup sequence is split into 7 8-bit slices and 2 slices that sum to 8 bits,
+ *     an addition gate is required to complete the rotation)
+ *
+ * Total cost is 248 gates.
+ *
+ * N.B. Can reduce lookup costs by using larger lookup tables.
+ * Current algo is optimized for lookup tables where sum of all table sizes is < 2^64
+ */
+template <typename Composer> void keccak<Composer>::rho(keccak_state& internal)
+{
+    constexpr_for<0, 25, 1>(
+        [&]<size_t i>() { internal.state[i] = normalize_and_rotate<i>(internal.state[i], internal.state_msb[i]); });
+}
+
+/**
+ * @brief PI
+ *
+ * PI permutes the keccak lanes. Adds 0 constraints as this is simply a
+ * re-ordering of witnesses
+ *
+ * @tparam Composer
+ * @param internal
+ */
+template <typename Composer> void keccak<Composer>::pi(keccak_state& internal)
+{
+    std::array<field_ct, 25> B;
+
+    for (size_t j = 0; j < 5; ++j) {
+        for (size_t i = 0; i < 5; ++i) {
+            B[j * 5 + i] = internal.state[j * 5 + i];
+        }
+    }
+
+    for (size_t y = 0; y < 5; ++y) {
+        for (size_t x = 0; x < 5; ++x) {
+            size_t u = (0 * x + 1 * y) % 5;
+            size_t v = (2 * x + 3 * y) % 5;
+
+            internal.state[v * 5 + u] = B[5 * y + x];
+        }
+    }
+}
+
+/**
+ * @brief CHI
+ *
+ * The CHI round applies the following logic to the hash lanes:
+ *     A XOR (~B AND C)
+ *
+ * In base-11 representation we can create an equivalent linear operation:
+ *     1 + 2A - B + C
+ *
+ * Output values will range from [0, 1, 2, 3, 4] and are mapped back into [0, 1]
+ * via the KECCAK_CHI_OUTPUT lookup table
+ *
+ * N.B. the KECCAK_CHI_OUTPUT table also has a column for the most significant bit of each lookup.
+ *      We use this to create a 'twisted' representation of each hash lane (see THETA comments for more details)
+ * @tparam Composer
+ */
+template <typename Composer> void keccak<Composer>::chi(keccak_state& internal)
+{
+    // (cost = 12 * 25 = 300?)
+    auto& state = internal.state;
+
+    for (size_t y = 0; y < 5; ++y) {
+        std::array<field_ct, 5> lane_outputs;
+        for (size_t x = 0; x < 5; ++x) {
+            const auto A = state[y * 5 + x];
+            const auto B = state[y * 5 + ((x + 1) % 5)];
+            const auto C = state[y * 5 + ((x + 2) % 5)];
+
+            // vv should cost 1 gate
+            lane_outputs[x] = (A + A + CHI_OFFSET).add_two(-B, C);
+        }
+        for (size_t x = 0; x < 5; ++x) {
+            // Normalize lane outputs and assign to internal.state
+            auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_CHI_OUTPUT, lane_outputs[x]);
+            internal.state[y * 5 + x] = accumulators[ColumnIdx::C2][0];
+            internal.state_msb[y * 5 + x] = accumulators[ColumnIdx::C3][accumulators[ColumnIdx::C3].size() - 1];
+        }
+    }
+}
+
+/**
+ * @brief IOTA
+ *
+ * XOR first hash limb with a precomputed constant.
+ * We re-use the KECCAK_NORMALIZE_AND_ROTATE tables (via `normalize_and_rotate<0>`) to normalize after this operation
+ * @tparam Composer
+ * @param internal
+ * @param round
+ */
+template <typename Composer> void keccak<Composer>::iota(keccak_state& internal, size_t round)
+{
+    const field_ct xor_result = internal.state[0] + SPARSE_RC[round];
+
+    // normalize lane value so that we don't overflow our base11 modulus boundary in the next round
+    internal.state[0] = normalize_and_rotate<0>(xor_result, internal.state_msb[0]);
+
+    // No need to add constraints to compute twisted repr if this is the last round
+    if (round != 23) {
+        compute_twisted_state(internal);
+    }
+}
+
+template <typename Composer> void keccak<Composer>::keccakf1600(keccak_state& internal)
+{
+    for (size_t i = 0; i < 24; ++i) {
+        theta(internal);
+        rho(internal);
+        pi(internal);
+        chi(internal);
+        iota(internal, i);
+    }
+}
+
+template <typename Composer>
+void keccak<Composer>::sponge_absorb(keccak_state& internal,
+                                     const std::vector<field_ct>& input_buffer,
+                                     const std::vector<field_ct>& msb_buffer)
+{
+    const size_t l = input_buffer.size();
+
+    const size_t num_blocks = l / (BLOCK_SIZE / 8);
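+    // (BLOCK_SIZE is measured in bytes, so BLOCK_SIZE / 8 = LIMBS_PER_BLOCK = 17 64-bit limbs per block)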
+
+    for (size_t i = 0; i < num_blocks; ++i) {
+        if (i == 0) {
+            for (size_t j = 0; j < LIMBS_PER_BLOCK; ++j) {
+                internal.state[j] = input_buffer[j];
+                internal.state_msb[j] = msb_buffer[j];
+            }
+            for (size_t j = LIMBS_PER_BLOCK; j < 25; ++j) {
+                internal.state[j] = witness_ct::create_constant_witness(internal.context, 0);
+                internal.state_msb[j] = witness_ct::create_constant_witness(internal.context, 0);
+            }
+        } else {
+            for (size_t j = 0; j < LIMBS_PER_BLOCK; ++j) {
+                internal.state[j] += input_buffer[i * LIMBS_PER_BLOCK + j];
+
+                internal.state[j] = normalize_and_rotate<0>(internal.state[j], internal.state_msb[j]);
+            }
+        }
+
+        compute_twisted_state(internal);
+        keccakf1600(internal);
+    }
+}
+
+template <typename Composer> byte_array<Composer> keccak<Composer>::sponge_squeeze(keccak_state& internal)
+{
+    byte_array_ct result(internal.context);
+
+    // Each hash limb represents a little-endian integer. Need to reverse bytes before we write into the output array
+    for (size_t i = 0; i < 4; ++i) {
+        field_ct output_limb = plookup_read::read_from_1_to_2_table(KECCAK_FORMAT_OUTPUT, internal.state[i]);
+        byte_array_ct limb_bytes(output_limb, 8);
+        byte_array_ct little_endian_limb_bytes(internal.context, 8);
+        little_endian_limb_bytes.set_byte(0, limb_bytes[7]);
+        little_endian_limb_bytes.set_byte(1, limb_bytes[6]);
+        little_endian_limb_bytes.set_byte(2, limb_bytes[5]);
+        little_endian_limb_bytes.set_byte(3, limb_bytes[4]);
+        little_endian_limb_bytes.set_byte(4, limb_bytes[3]);
+        little_endian_limb_bytes.set_byte(5, limb_bytes[2]);
+        little_endian_limb_bytes.set_byte(6, limb_bytes[1]);
+        little_endian_limb_bytes.set_byte(7, limb_bytes[0]);
+        result.write(little_endian_limb_bytes);
+    }
+    return result;
+}
+
+template <typename Composer> stdlib::byte_array<Composer> keccak<Composer>::hash(byte_array_ct& input)
+{
+    auto ctx = input.get_context();
+
+    if (ctx == nullptr) {
+        // if buffer is constant compute hash and return w/o creating constraints
+        byte_array_ct output(nullptr, 32);
+        const std::vector<uint8_t> result = hash_native(input.get_value());
+        for (size_t i = 0; i < 32; ++i) {
+            output.set_byte(i, result[i]);
+        }
+        return output;
+    }
+
+    const size_t input_size = input.size();
+
+    // copy input into buffer and pad
+    const size_t blocks = input_size / BLOCK_SIZE;
+    const size_t blocks_length = (BLOCK_SIZE * (blocks + 1));
+
+    byte_array_ct block_bytes(input);
+
+    const size_t byte_difference = blocks_length - input_size;
+    byte_array_ct padding_bytes(ctx, byte_difference);
+    for (size_t i = 0; i < byte_difference; ++i) {
+        padding_bytes.set_byte(i, witness_ct::create_constant_witness(ctx, 0));
+    }
+
+    block_bytes.write(padding_bytes);
+    block_bytes.set_byte(input_size, witness_ct::create_constant_witness(ctx, 0x1));
+    block_bytes.set_byte(block_bytes.size() - 1, witness_ct::create_constant_witness(ctx, 0x80));
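+
+    // (illustrative example: a 65-byte input is padded to 136 bytes, with byte 65 set to 0x01 and byte 135 set to
+    //  0x80, matching the keccak256 pad10*1 rule; an input whose length is an exact multiple of 136 gains a full
+    //  extra padding block)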
+
+    // keccak lanes interpret memory as little-endian integers,
+    // which means we need to swap our byte ordering...
+    for (size_t i = 0; i < block_bytes.size(); i += 8) {
+        std::array<field_ct, 8> temp;
+        for (size_t j = 0; j < 8; ++j) {
+            temp[j] = block_bytes[i + j];
+        }
+        block_bytes.set_byte(i, temp[7]);
+        block_bytes.set_byte(i + 1, temp[6]);
+        block_bytes.set_byte(i + 2, temp[5]);
+        block_bytes.set_byte(i + 3, temp[4]);
+        block_bytes.set_byte(i + 4, temp[3]);
+        block_bytes.set_byte(i + 5, temp[2]);
+        block_bytes.set_byte(i + 6, temp[1]);
+        block_bytes.set_byte(i + 7, temp[0]);
+    }
+    const size_t byte_size = block_bytes.size();
+    keccak_state internal;
+    internal.context = ctx;
+
+    const size_t num_limbs = byte_size / WORD_SIZE;
+    std::vector<field_ct> converted_buffer(num_limbs);
+    std::vector<field_ct> msb_buffer(num_limbs);
+
+    for (size_t i = 0; i < num_limbs; ++i) {
+        field_ct sliced;
+        if (i * WORD_SIZE + WORD_SIZE > byte_size) {
+            const size_t slice_size = byte_size - (i * WORD_SIZE);
+            const size_t byte_shift = (WORD_SIZE - slice_size) * 8;
+            sliced = field_ct(block_bytes.slice(i * WORD_SIZE, slice_size));
+            sliced = (sliced * (uint256_t(1) << byte_shift)).normalize();
+        } else {
+            sliced = field_ct(block_bytes.slice(i * WORD_SIZE, WORD_SIZE));
+        }
+        const auto accumulators = plookup_read::get_lookup_accumulators(KECCAK_FORMAT_INPUT, sliced);
+        converted_buffer[i] = accumulators[ColumnIdx::C2][0];
+        msb_buffer[i] = accumulators[ColumnIdx::C3][accumulators[ColumnIdx::C3].size() - 1];
+    }
+
+    sponge_absorb(internal, converted_buffer, msb_buffer);
+
+    auto result = sponge_squeeze(internal);
+
+    return result;
+}
+
+template class keccak<plonk::UltraComposer>;
+
+} // namespace stdlib
+} // namespace plonk
\ No newline at end of file
diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp
new file mode 100644
index 0000000000..c71612392d
--- /dev/null
+++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.hpp
@@ -0,0 +1,189 @@
+#pragma once
+#include <array>
+#include <stdlib/primitives/uint/uint.hpp>
+#include <stdlib/primitives/packed_byte_array/packed_byte_array.hpp>
+#include <stdlib/primitives/byte_array/byte_array.hpp>
+
+namespace plonk {
+class UltraComposer;
+} // namespace plonk
+
+namespace plonk {
+namespace stdlib {
+template <typename Composer> class bit_array;
+
+/**
+ * @brief KECCAAAAAAAAAAK
+ *
+ * Creates constraints that evaluate the Keccak256 hash algorithm.
+ *
+ * UltraPlonk only due to heavy lookup table use.
+ *
+ * Current cost 17,329 constraints for a 1-block hash
+ * using small(ish) lookup tables (total size < 2^64)
+ *
+ * @tparam Composer
+ */
+template <typename Composer> class keccak {
+  public:
+    using witness_ct = stdlib::witness_t<Composer>;
+    using field_ct = stdlib::field_t<Composer>;
+    using byte_array_ct = stdlib::byte_array<Composer>;
+
+    // base of extended representation we use for efficient logic operations
+    static constexpr uint256_t BASE = 11;
+
+    // number of bits of hash output
+    static constexpr size_t BITS = 256;
+
+    // word size of hash lane
+    static constexpr size_t WORD_SIZE = 8;
+
+    // block size. We only support keccak256 with a 1088-bit rate! This is what Ethereum uses
+    static constexpr size_t BLOCK_SIZE = (1600 - BITS * 2) / WORD_SIZE;
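+    // (for BITS = 256 this evaluates to (1600 - 512) / 8 = 136 bytes, i.e. the 1088-bit rate)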
+
+    // how many limbs fit into a block (17)
+    static constexpr size_t LIMBS_PER_BLOCK = BLOCK_SIZE / 8;
+
+    // round constants. Used in IOTA round
+    static constexpr std::array<uint64_t, 24> RC = {
+        0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000, 0x000000000000808b,
+        0x0000000080000001, 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, 0x0000000000000088,
+        0x0000000080008009, 0x000000008000000a, 0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+        0x8000000000008003, 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
+        0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
+    };
+
+    // Rotation offsets, y vertically, x horizontally: r[y * 5 + x]
+    static constexpr std::array<size_t, 25> ROTATIONS = {
+        0, 1, 62, 28, 27, 36, 44, 6, 55, 20, 3, 10, 43, 25, 39, 41, 45, 15, 21, 8, 18, 2, 61, 56, 14,
+    };
+
+    /**
+     * @brief Convert a binary integer into a base11 integer
+     *
+     * Input  = \sum_{i=0}^63 b_i * 2^i
+     * Output = \sum_{i=0}^63 b_i * 11^i
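+     *
+     * e.g. (worked by hand, not a value used elsewhere): convert_to_sparse(0b101) = 1 * 11^2 + 0 * 11 + 1 = 122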
+     *
+     * @param input
+     * @return constexpr uint256_t sparse form of input
+     */
+    static constexpr uint256_t convert_to_sparse(uint256_t input)
+    {
+        std::array<uint64_t, 64> out_bits;
+        size_t count = 0;
+        while (input > 0) {
+            uint64_t bit = static_cast<uint64_t>(input & 1);
+            out_bits[count++] = bit;
+            input = input >> 1;
+        }
+        uint256_t output = 0;
+        for (size_t i = 0; i < count; ++i) {
+            output *= BASE;
+            output += out_bits[count - 1 - i];
+        }
+        return output;
+    };
+
+    /**
+     * @brief Normalize a base-11 integer where each base value can be > 1
+     *
+     * Input  = \sum_{i=0}^63 b_i * 11^i
+     * Output = \sum_{i=0}^63 (b_i & 1) * 11^i
+     *
+     * (XORs are evaluated by adding integers in sparse-form and normalizing. Even = 0, Odd = 1)
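+     *
+     * e.g. (worked by hand): an input with quasi-bits [2, 1, 0], i.e. 2 * 11^2 + 1 * 11 + 0 = 253,
+     * normalizes to [0, 1, 0], i.e. 11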
+     *
+     * @param input
+     * @return constexpr uint256_t
+     */
+    static constexpr uint256_t normalize_sparse(uint256_t input)
+    {
+        std::array<uint64_t, 64> out_bits;
+        size_t count = 0;
+        while (input > 0) {
+            const auto [quotient, slice] = input.divmod(BASE);
+            uint64_t bit = static_cast<uint64_t>(slice) & 1;
+            out_bits[count++] = bit;
+            input = quotient;
+        }
+        uint256_t out;
+        for (size_t i = 0; i < count; ++i) {
+            out *= BASE;
+            out += out_bits[count - 1 - i];
+        }
+        return out;
+    }
+
+    /**
+     * @brief Get the sparse round constants object
+     *
+     * @return constexpr std::array<uint256_t, 24>
+     */
+    static constexpr std::array<uint256_t, 24> get_sparse_round_constants()
+    {
+        std::array<uint256_t, 24> output;
+        for (size_t i = 0; i < 24; ++i) {
+            output[i] = convert_to_sparse(RC[i]);
+        }
+        return output;
+    }
+    static constexpr std::array<uint256_t, 24> SPARSE_RC = get_sparse_round_constants();
+
+    /**
+     * @brief Compute the constant offset added in the `Chi` round
+     *
+     * We want to compute, for each bit slice of the inputs A, B, C...
+     *  1 + 2A - B + C
+     *
+     * i.e. we need to add the constant value \sum_{i=0}^63 11^i
+     *
+     * @return constexpr uint256_t
+     */
+    static constexpr uint256_t get_chi_offset()
+    {
+        uint256_t result = 0;
+        for (size_t i = 0; i < 64; ++i) {
+            result *= 11;
+            result += 1;
+        }
+        return result;
+    }
+    static constexpr uint256_t CHI_OFFSET = get_chi_offset();
+
+    struct keccak_state {
+        std::array<field_ct, 25> state;
+        std::array<field_ct, 25> state_msb;
+        std::array<field_ct, 25> twisted_state;
+        Composer* context;
+    };
+
+    template <size_t lane_index> static field_t<Composer> normalize_and_rotate(const field_ct& limb, field_ct& msb);
+    static void compute_twisted_state(keccak_state& internal);
+    static void theta(keccak_state& state);
+    static void rho(keccak_state& state);
+    static void pi(keccak_state& state);
+    static void chi(keccak_state& state);
+    static void iota(keccak_state& state, size_t round);
+    static void sponge_absorb(keccak_state& internal,
+                              const std::vector<field_ct>& input_buffer,
+                              const std::vector<field_ct>& msb_buffer);
+    static byte_array_ct sponge_squeeze(keccak_state& internal);
+    static void keccakf1600(keccak_state& state);
+    static byte_array_ct hash(byte_array_ct& input);
+
+    static std::vector<uint8_t> hash_native(const std::vector<uint8_t>& data)
+    {
+        auto hash_result = ethash_keccak256(&data[0], data.size());
+
+        std::vector<uint8_t> output;
+        output.resize(32);
+
+        memcpy((void*)&output[0], (void*)&hash_result.word64s[0], 32);
+        return output;
+    }
+};
+
+extern template class keccak<plonk::UltraComposer>;
+
+} // namespace stdlib
+} // namespace plonk
diff --git a/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp b/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp
new file mode 100644
index 0000000000..793ed5bdf1
--- /dev/null
+++ b/cpp/src/aztec/stdlib/hash/keccak/keccak.test.cpp
@@ -0,0 +1,221 @@
+#include "keccak.hpp"
+#include <crypto/keccak/keccak.hpp>
+#include <gtest/gtest.h>
+#include <plonk/composer/ultra_composer.hpp>
+#include <numeric/random/engine.hpp>
+#include "../../primitives/plookup/plookup.hpp"
+
+using namespace barretenberg;
+using namespace plonk;
+
+typedef plonk::UltraComposer Composer;
+typedef stdlib::byte_array<Composer> byte_array;
+typedef stdlib::public_witness_t<Composer> public_witness_t;
+typedef stdlib::field_t<Composer> field_ct;
+typedef stdlib::witness_t<Composer> witness_ct;
+
+namespace {
+auto& engine = numeric::random::get_debug_engine();
+}
+
+namespace std {
+inline std::ostream& operator<<(std::ostream& os, std::vector<uint8_t> const& t)
+{
+    os << "[ ";
+    for (auto e : t) {
+        os << std::setfill('0') << std::hex << std::setw(2) << (int)e << " ";
+    }
+    os << "]";
+    return os;
+}
+} // namespace std
+
+TEST(stdlib_keccak, keccak_format_input_table)
+{
+    Composer composer = Composer();
+
+    for (size_t i = 0; i < 25; ++i) {
+        uint64_t limb_native = engine.get_random_uint64();
+        field_ct limb(witness_ct(&composer, limb_native));
+        stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_FORMAT_INPUT, limb);
+    }
+    auto prover = composer.create_prover();
+    auto verifier = composer.create_verifier();
+
+    auto proof = prover.construct_proof();
+
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, keccak_format_output_table)
+{
+    Composer composer = Composer();
+
+    for (size_t i = 0; i < 25; ++i) {
+        uint64_t limb_native = engine.get_random_uint64();
+        uint256_t extended_native = stdlib::keccak<Composer>::convert_to_sparse(limb_native);
+        field_ct limb(witness_ct(&composer, extended_native));
+        stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_FORMAT_OUTPUT, limb);
+    }
+    auto prover = composer.create_prover();
+    auto verifier = composer.create_verifier();
+    auto proof = prover.construct_proof();
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, keccak_theta_output_table)
+{
+    Composer composer = Composer();
+
+    for (size_t i = 0; i < 25; ++i) {
+        uint256_t extended_native = 0;
+        for (size_t j = 0; j < 8; ++j) {
+            extended_native *= 11;
+            uint64_t base_value = (engine.get_random_uint64() % 11);
+            extended_native += base_value;
+        }
+        field_ct limb(witness_ct(&composer, extended_native));
+        stdlib::plookup_read::read_from_1_to_2_table(plookup::KECCAK_THETA_OUTPUT, limb);
+    }
+    auto prover = composer.create_prover();
+    auto verifier = composer.create_verifier();
+    auto proof = prover.construct_proof();
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, keccak_rho_output_table)
+{
+    Composer composer = Composer();
+
+    barretenberg::constexpr_for<0, 25, 1>([&]<size_t i> {
+        uint256_t extended_native = 0;
+        uint256_t binary_native = 0;
+        for (size_t j = 0; j < 64; ++j) {
+            extended_native *= 11;
+            binary_native = binary_native << 1;
+            uint64_t base_value = (engine.get_random_uint64() % 3);
+            extended_native += base_value;
+            binary_native += (base_value & 1);
+        }
+        const size_t left_bits = stdlib::keccak<Composer>::ROTATIONS[i];
+        const size_t right_bits = 64 - left_bits;
+        const uint256_t left = binary_native >> right_bits;
+        const uint256_t right = binary_native - (left << right_bits);
+        const uint256_t binary_rotated = left + (right << left_bits);
+
+        const uint256_t expected_limb = stdlib::keccak<Composer>::convert_to_sparse(binary_rotated);
+        // the expected msb is only correct if rotation == 0 (we do not need the msb for rotated lookups)
+        const uint256_t expected_msb = (binary_native >> 63);
+        field_ct limb(witness_ct(&composer, extended_native));
+        field_ct result_msb;
+        field_ct result_limb = stdlib::keccak<Composer>::normalize_and_rotate<i>(limb, result_msb);
+        EXPECT_EQ(static_cast<uint256_t>(result_limb.get_value()), expected_limb);
+        EXPECT_EQ(static_cast<uint256_t>(result_msb.get_value()), expected_msb);
+    });
+
+    std::cout << "create prover?" << std::endl;
+    auto prover = composer.create_prover();
+    std::cout << "create verifier" << std::endl;
+    printf("composer gates = %zu\n", composer.get_num_gates());
+    auto verifier = composer.create_verifier();
+    std::cout << "make proof" << std::endl;
+    auto proof = prover.construct_proof();
+    std::cout << "verify proof" << std::endl;
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, keccak_chi_output_table)
+{
+    static constexpr uint64_t chi_normalization_table[5]{
+        0, // 1 + 2a - b + c => a xor (~b & c)
+        0, 1, 1, 0,
+    };
+    Composer composer = Composer();
+
+    for (size_t i = 0; i < 25; ++i) {
+        uint256_t normalized_native = 0;
+        uint256_t extended_native = 0;
+        uint256_t binary_native = 0;
+        for (size_t j = 0; j < 8; ++j) {
+            extended_native *= 11;
+            normalized_native *= 11;
+            binary_native = binary_native << 1;
+            uint64_t base_value = (engine.get_random_uint64() % 5);
+            extended_native += base_value;
+            normalized_native += chi_normalization_table[base_value];
+            binary_native += chi_normalization_table[base_value];
+        }
+        field_ct limb(witness_ct(&composer, extended_native));
+        const auto accumulators = stdlib::plookup_read::get_lookup_accumulators(plookup::KECCAK_CHI_OUTPUT, limb);
+
+        field_ct normalized = accumulators[plookup::ColumnIdx::C2][0];
+        field_ct msb = accumulators[plookup::ColumnIdx::C3][accumulators[plookup::ColumnIdx::C3].size() - 1];
+
+        EXPECT_EQ(static_cast<uint256_t>(normalized.get_value()), normalized_native);
+        EXPECT_EQ(static_cast<uint256_t>(msb.get_value()), binary_native >> 63);
+    }
+    printf("composer gates = %zu\n", composer.get_num_gates());
+    auto prover = composer.create_prover();
+    auto verifier = composer.create_verifier();
+    std::cout << "make proof" << std::endl;
+    auto proof = prover.construct_proof();
+    std::cout << "verify proof" << std::endl;
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, test_single_block)
+{
+    Composer composer = Composer();
+    std::string input = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz01";
+    std::vector<uint8_t> input_v(input.begin(), input.end());
+
+    byte_array input_arr(&composer, input_v);
+    byte_array output = stdlib::keccak<Composer>::hash(input_arr);
+
+    std::vector<uint8_t> expected = stdlib::keccak<Composer>::hash_native(input_v);
+
+    EXPECT_EQ(output.get_value(), expected);
+
+    composer.print_num_gates();
+
+    auto prover = composer.create_prover();
+    std::cout << "prover circuit_size = " << prover.key->circuit_size << std::endl;
+    auto verifier = composer.create_verifier();
+
+    auto proof = prover.construct_proof();
+
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}
+
+TEST(stdlib_keccak, test_double_block)
+{
+    Composer composer = Composer();
+    std::string input = "";
+    for (size_t i = 0; i < 200; ++i) {
+        input += "a";
+    }
+    std::vector<uint8_t> input_v(input.begin(), input.end());
+
+    byte_array input_arr(&composer, input_v);
+    byte_array output = stdlib::keccak<Composer>::hash(input_arr);
+
+    std::vector<uint8_t> expected = stdlib::keccak<Composer>::hash_native(input_v);
+
+    EXPECT_EQ(output.get_value(), expected);
+
+    composer.print_num_gates();
+
+    auto prover = composer.create_prover();
+    auto verifier = composer.create_verifier();
+
+    auto proof = prover.construct_proof();
+
+    bool proof_result = verifier.verify_proof(proof);
+    EXPECT_EQ(proof_result, true);
+}