diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a984dd9..f5200a1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,6 +17,25 @@ jobs: - uses: actions-rust-lang/setup-rust-toolchain@v1 - run: cargo build --verbose - run: cargo test --verbose --all + + build_old: + name: cargo build and test (packed_simd) + strategy: + matrix: + # Needs big runners to run the tests + # Only macos-13-xlarge is Apple Silicon, as per: + # https://docs.github.com/en/actions/using-github-hosted-runners/about-larger-runners/about-larger-runners#about-macos-larger-runners + os: [ubuntu-22.04-github-hosted-16core, macos-13-xlarge] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: nightly-2023-05-31 + + # Still compile with the old Rust nightly and packed_simd - until we have a good replacement in poseidon. + - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 build --features include_packed_simd + - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 test --features include_packed_simd formatting: name: cargo fmt diff --git a/.gitignore b/.gitignore index d54047a..2460227 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ /target /profiling-target/target /Cargo.lock + +.idea/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 3d658ff..e44d1ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ itertools = "0.10" blake2 = "0.10" sha2 = "0.10" num-modular = "0.5.1" -packed_simd = { version = "0.3.9" } +packed_simd = { version = "0.3.9", optional = true } pairing = { package = "pairing_ce", git = "https://github.com/matter-labs/pairing.git" } crypto-bigint = "0.5" convert_case = "*" @@ -52,4 +52,11 @@ lto = "fat" opt-level = 3 [features] +# If enabled, logs will use the `tracing` crate; if disabled, they will be printed to stdout. log_tracing = ["tracing"] +# packed_simd no longer works with the newest nightly, but we keep it as a feature: +# not all of the code has been migrated yet, and some people might want to use an +# older Rust nightly to gain some performance.
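+# Example: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 build --features include_packed_simd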
+include_packed_simd = ["packed_simd"] +cr_paranoia_mode = [] +debug_track = [] \ No newline at end of file diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 6b48c00..a671fa6 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2023-06-25" +channel = "nightly-2024-05-07" diff --git a/src/cs/implementations/setup.rs b/src/cs/implementations/setup.rs index 104cd8e..979f816 100644 --- a/src/cs/implementations/setup.rs +++ b/src/cs/implementations/setup.rs @@ -127,8 +127,11 @@ impl< let required_rows = num_used_rows + 1; assert!(required_rows <= self.max_trace_len); - dbg!(required_rows); - dbg!(self.lookups_tables_total_len()); + log!("required_rows = {:?}", required_rows); + log!( + "lookups_tables_total_len = {}", + self.lookups_tables_total_len() + ); let required_rows = std::cmp::max(required_rows, self.lookups_tables_total_len()); @@ -185,12 +188,27 @@ impl< .max() .unwrap_or(0); - dbg!(required_size); - dbg!(max_copiable_in_specialized_columns); - dbg!(max_witnesses_in_general_purpose_columns); - dbg!(max_witnesses_in_specialized_columns); - dbg!(max_constants_for_general_purpose_gates); - dbg!(max_in_column_for_specialized_gates); + log!("required_size = {}", required_size); + log!( + "max_copiable_in_specialized_columns = {}", + max_copiable_in_specialized_columns + ); + log!( + "max_witnesses_in_general_purpose_columns = {}", + max_witnesses_in_general_purpose_columns + ); + log!( + "max_witnesses_in_specialized_columns = {}", + max_witnesses_in_specialized_columns + ); + log!( + "max_constants_for_general_purpose_gates = {}", + max_constants_for_general_purpose_gates + ); + log!( + "max_in_column_for_specialized_gates = {}", + max_in_column_for_specialized_gates + ); assert!(max_constants_for_general_purpose_gates <= required_size); @@ -312,7 +330,7 @@ impl< dst.resize(precise_required_size, F::ZERO); } - dbg!(precise_required_size); + log!("precise_required_size = {}", precise_required_size); self.max_trace_len = precise_required_size; finalization_hints.final_trace_len = precise_required_size; @@ -729,7 +747,7 @@ impl< let extra_polys_for_selectors = number_of_constant_polys_for_general_purpose_gates - self.parameters.num_constant_columns; - dbg!(extra_polys_for_selectors); + log!("extra_polys_for_selectors = {}", extra_polys_for_selectors); let quotient_degree_from_constraits = if max_constraint_contribution_degree > 0 { max_constraint_contribution_degree - 1 @@ -1037,7 +1055,7 @@ impl< let (constant_columns, selectors_placement, min_degree) = self.create_constant_setup_polys(worker); - dbg!(min_degree); + log!("min_degree = {}", min_degree); let (_, total_num_constants_for_gates_over_general_purpose_columns) = selectors_placement.compute_stats(); diff --git a/src/cs/traits/cs.rs b/src/cs/traits/cs.rs index da3b51d..5d4d786 100644 --- a/src/cs/traits/cs.rs +++ b/src/cs/traits/cs.rs @@ -20,7 +20,7 @@ impl<'set, 'tgt: 'set, T: SmallField> DstBuffer<'set, 'tgt, T> { *offset += 1; } DstBuffer::MutSliceIndirect(dst, debug_track, offset) => { - if cfg!(debug_track) && *debug_track { + if cfg!(feature = "debug_track") && *debug_track { log!(" set out {} <- {}", *offset, value.as_raw_u64()) } diff --git a/src/dag/guide.rs b/src/dag/guide.rs index 0eba95b..47a6315 100644 --- a/src/dag/guide.rs +++ b/src/dag/guide.rs @@ -384,7 +384,7 @@ impl<'a, T: Copy + Debug, F: SmallField, Cfg: CSResolverConfig> GuideOrder<'a, T pos += span.buffer.len(); } - if cfg!(cr_paranoia_mode) && self.guide.tracing { + if cfg!(feature = 
"cr_paranoia_mode") && self.guide.tracing { log!( "Released span {}: {:?}", self.guide.spans[0].id.0, @@ -684,7 +684,7 @@ impl BufferGuide { } pub(crate) fn flush(&mut self) -> BufferGuideFinalization<'_, T, F, Cfg> { - if cfg!(cr_paranoia_mode) && self.tracing { + if cfg!(feature = "cr_paranoia_mode") && self.tracing { log!("CRG: flush."); } diff --git a/src/dag/resolver_box.rs b/src/dag/resolver_box.rs index 397d551..d0f7dd8 100644 --- a/src/dag/resolver_box.rs +++ b/src/dag/resolver_box.rs @@ -424,7 +424,7 @@ pub(crate) fn invocation_binder( // Safety: This is the actual type of the provided function. let bound = resolver.resolve_fn::(); - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false { log!( "Ivk: Ins [{}], Out [{}], Out-addr [{}], Thread [{}]", resolver @@ -448,7 +448,10 @@ pub(crate) fn invocation_binder( ) } - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && false { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) + && debug_track + && false + { log!( "Ivk: provided inputs:\n - {:?}", ins.iter().map(|x| x.as_raw_u64()).collect_vec() @@ -457,7 +460,10 @@ pub(crate) fn invocation_binder( bound(ins, &mut DstBuffer::MutSliceIndirect(out, debug_track, 0)); - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && debug_track && true { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) + && debug_track + && true + { log!( "Ivk: calculated outputs:\n - {:?}", out.iter().map(|x| x.as_raw_u64()).collect_vec() diff --git a/src/dag/resolvers/mt/mod.rs b/src/dag/resolvers/mt/mod.rs index cad5822..8de30f5 100644 --- a/src/dag/resolvers/mt/mod.rs +++ b/src/dag/resolvers/mt/mod.rs @@ -169,7 +169,7 @@ impl, CFG: CSResolverConfig> let debug_track = vec![]; - if cfg!(cr_paranoia_mode) || PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || PARANOIA { log!("Contains tracked keys {:?} ", debug_track); } @@ -269,7 +269,7 @@ impl, CFG: CSResolverConfig> self.sorter.write_sequence(); - if cfg!(cr_paranoia_mode) || PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || PARANOIA { log!("CR {:?}", unsafe { self.common.awaiters_broker.stats.u_deref() }); @@ -1487,7 +1487,7 @@ mod test { storage.wait_till_resolved(); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { log!("Test: total value result: \n - {}", unsafe { (*storage.common.values.get()) .variables @@ -1509,7 +1509,7 @@ mod test { let act = Place::from_variable(Variable::from_variable_index(ix as u64)) .to(|x| storage.get_value_unchecked(x)); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { log!("Test: per item value: ix {}, value {}", ix, act); } @@ -1542,7 +1542,7 @@ mod test { storage.wait_till_resolved(); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { log!("Test: total value result: \n - {}", unsafe { (*storage.common.values.get()) .variables @@ -1564,7 +1564,7 @@ mod test { let act = Place::from_variable(Variable::from_variable_index(ix as u64)) .to(|x| storage.get_value_unchecked(x)); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { log!("Test: per item value: ix {}, value {}", ix, act); } diff --git a/src/dag/resolvers/mt/registrar.rs b/src/dag/resolvers/mt/registrar.rs index 3c43c6b..257703f 100644 --- a/src/dag/resolvers/mt/registrar.rs +++ b/src/dag/resolvers/mt/registrar.rs @@ -116,7 +116,7 @@ impl Registrar { } 
pub(crate) fn is_empty(&self) -> bool { - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { log!( "CRR: total remaining resolvers: {}", self.vars.values().map(|x| x.len()).sum::<usize>() ) diff --git a/src/dag/resolvers/mt/resolution_window.rs b/src/dag/resolvers/mt/resolution_window.rs index 9efe2cb..60d1d23 100644 --- a/src/dag/resolvers/mt/resolution_window.rs +++ b/src/dag/resolvers/mt/resolution_window.rs @@ -163,8 +163,12 @@ impl + 'static> comms, track_list: Vec::new(), - execution_list: if cfg!(cr_paranoia_mode) { 1 << 26 } else { 0 } - .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))), + execution_list: if cfg!(feature = "cr_paranoia_mode") { + 1 << 26 + } else { + 0 + } + .to(|x| Vec::with_capacity(x).op(|v| v.resize(x, 0))), phantom: PhantomData, }; @@ -207,7 +211,7 @@ impl + 'static> data[data_ix].push(order_ix.into(), task.order_info.value); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { self.execution_list[order_ix] += 1; if self.execution_list[order_ix] > 1 { @@ -238,7 +242,7 @@ impl + 'static> } } - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true { log!("RW: Batch! {} tasks.", count); } @@ -264,7 +268,7 @@ impl + 'static> .for_each(|x| { x.state = ResolverState::Done; - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { unsafe { let r = self.common.resolvers.u_deref().get(x.order_info.value); @@ -291,7 +295,7 @@ impl + 'static> } }); - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { if self .exec_order_buffer .iter() @@ -343,7 +347,7 @@ impl + 'static> drop(awaiters); - if cfg!(cr_paranoia_mode) && count > 0 { + if cfg!(feature = "cr_paranoia_mode") && count > 0 { log!( "RW: Shifted by {}, new range is: {}..{}, buffer len: {}", count, @@ -412,7 +416,7 @@ impl + 'static> self.stats.total_consumption = extend_to as u64; - if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) { + if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") { log!( "RW: Extended range by {}, new range {}..{}", extend_to, @@ -474,7 +478,7 @@ impl + 'static> } } - if crate::dag::resolvers::mt::PARANOIA || cfg!(cr_paranoia_mode) { + if crate::dag::resolvers::mt::PARANOIA || cfg!(feature = "cr_paranoia_mode") { log!("[{:?}] RW: Exit conditions met.", std::time::Instant::now()) } @@ -484,7 +488,7 @@ impl + 'static> self.stats.total_time = start_instant.elapsed(); - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { log!("CR {:#?}", self.stats); log!("CR {:#?}", unsafe { &*self.channel.stats.get() }); @@ -554,7 +558,7 @@ impl, const SIZE: usize> // here, as this is an unsynchronized access.
let resolver = this.common.resolvers.u_deref().get(*resolver_ix); - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { this.invoke(resolver, *order_ix); @@ -590,7 +594,7 @@ impl, const SIZE: usize> }); } - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { log!( "{}\n{:#?}\n{:#?}", std::thread::current().name().unwrap_or_default(), @@ -629,7 +633,7 @@ impl, const SIZE: usize> .map(|x| { let (vs, md) = self.common.values.u_deref().get_item_ref(*x); - if cfg!(cr_paranoia_mode) || true { + if cfg!(feature = "cr_paranoia_mode") || true { if Cfg::ASSERT_TRACKED_VALUES { assert!(md.is_tracked()); } @@ -678,7 +682,7 @@ impl, const SIZE: usize> let mut track = false; - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { if let Some(x) = self .debug_track .iter() @@ -831,7 +835,7 @@ impl LockStepChannel { fn execute(&self) { use std::sync::atomic::Ordering::*; - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && false { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && false { log!("RW: batch sent {:#?}", unsafe { self.data.u_deref() }); } diff --git a/src/dag/resolvers/mt/sorters/sorter_live.rs b/src/dag/resolvers/mt/sorters/sorter_live.rs index c0c1298..6b1e423 100644 --- a/src/dag/resolvers/mt/sorters/sorter_live.rs +++ b/src/dag/resolvers/mt/sorters/sorter_live.rs @@ -191,7 +191,7 @@ impl } } - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { // This ugly block checks that the calculated parallelism is // correct. It's a bit slower than O(n^2). Also note that it // checks only the last 1050 items, so it's not a full check, @@ -297,7 +297,7 @@ impl ResolverS } fn set_value(&mut self, key: crate::cs::Place, value: F) { - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && self.debug_track.contains(&key) && false { @@ -378,7 +378,7 @@ impl ResolverS let mut hit = false; - if (cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA) && true { + if (cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA) && true { if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) { log!("CR: added resolution with tracked input {:?}", x); @@ -498,7 +498,7 @@ impl ResolverS outputs: &[Place], added_at: RegistrationNum, ) -> Vec { - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { if let Some(x) = self.debug_track.iter().find(|x| inputs.contains(x)) { log!("CR: internalized resolution with tracked input {:?}", x); } @@ -519,7 +519,7 @@ impl ResolverS let deps = inputs.iter().map(|x| &values.get_item_ref(*x).1); - if cfg!(cr_paranoia_mode) { + if cfg!(feature = "cr_paranoia_mode") { debug_assert!( deps.clone().all(|x| { x.is_tracked() }), "Attempting to internalize a resolution with an untracked input. All inputs must be tracked."
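For context on the recurring change in these hunks: `cfg!(cr_paranoia_mode)` tests a raw compiler cfg flag, which is only set by passing `RUSTFLAGS="--cfg cr_paranoia_mode"`, so under a plain `cargo build` it silently evaluates to false; `cfg!(feature = "cr_paranoia_mode")` tests the Cargo feature that this PR declares in `Cargo.toml`. A minimal sketch of the difference (the helper function is illustrative, not part of the PR):

    fn paranoia_enabled() -> bool {
        // Old form: only true when built with RUSTFLAGS="--cfg cr_paranoia_mode";
        // `cargo build --features cr_paranoia_mode` left it false.
        // cfg!(cr_paranoia_mode)

        // New form: true when built with `cargo build --features cr_paranoia_mode`.
        cfg!(feature = "cr_paranoia_mode")
    }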
@@ -610,14 +610,14 @@ impl ResolverS self.record.values_count = unsafe { self.common.values.u_deref().max_tracked + 1 } as usize; self.record.registrations_count = self.stats.registrations_added as usize; - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { log!( "CR: Final order written. Order len {}", self.common.exec_order.lock().unwrap().items.len() ); } - if cfg!(cr_paranoia_mode) || crate::dag::resolvers::mt::PARANOIA { + if cfg!(feature = "cr_paranoia_mode") || crate::dag::resolvers::mt::PARANOIA { self.guide.stats.finalize(); log!("CR {:?}", self.guide.stats); diff --git a/src/field/goldilocks/arm_asm_impl.rs b/src/field/goldilocks/arm_asm_impl.rs index 03399c4..369b881 100644 --- a/src/field/goldilocks/arm_asm_impl.rs +++ b/src/field/goldilocks/arm_asm_impl.rs @@ -2,8 +2,10 @@ use crate::cs::implementations::utils::precompute_twiddles_for_fft; use crate::cs::traits::GoodAllocator; use crate::field::{Field, PrimeField}; use crate::worker::Worker; -use packed_simd::shuffle; +use std::intrinsics::simd::simd_shuffle; use std::ops::{Add, BitOr, Sub}; +use std::simd::cmp::{SimdPartialEq, SimdPartialOrd}; +use std::simd::{u64x4, u64x8}; use std::usize; use super::GoldilocksField; @@ -17,7 +19,7 @@ pub struct MixedGL(pub [GoldilocksField; 16]); // we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8 #[derive(Clone, Copy)] #[repr(C, align(64))] -struct U64x4Holder([packed_simd::u64x4; 4]); +struct U64x4Holder([u64x4; 4]); impl std::fmt::Debug for MixedGL { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -38,8 +40,8 @@ impl MixedGL { pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY; pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000 pub const EPSILON: u64 = (1 << 32) - 1; - pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON); - pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON); + pub const EPSILON_VECTOR: u64x4 = u64x4::from_array([Self::EPSILON; 4]); + pub const EPSILON_VECTOR_D: u64x8 = u64x8::from_array([Self::EPSILON; 8]); #[inline(always)] pub fn new() -> Self { @@ -64,7 +66,7 @@ impl MixedGL { for i in 0..4 { let a = a_u64.0[i]; let a_reduced = a.add(Self::EPSILON_VECTOR); - let cmp = a_reduced.lt(Self::EPSILON_VECTOR); + let cmp = a_reduced.simd_lt(Self::EPSILON_VECTOR); let res = cmp.select(a_reduced, a); a_u64.0[i] = res; @@ -108,13 +110,13 @@ impl MixedGL { let b = b_u64.0[i]; //additional reduction over b let b_reduced = b.add(Self::EPSILON_VECTOR); - let cmp = b_reduced.lt(Self::EPSILON_VECTOR); + let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR); let b = cmp.select(b_reduced, b); //a+b let sum = a.add(b); let sum_reduced = sum.add(Self::EPSILON_VECTOR); - let cmp0 = sum_reduced.lt(sum); - let cmp1 = sum.lt(a); + let cmp0 = sum_reduced.simd_lt(sum); + let cmp1 = sum.simd_lt(a); let reduce_flag = cmp0.bitor(cmp1); let res = reduce_flag.select(sum_reduced, sum); @@ -139,12 +141,12 @@ impl MixedGL { let b = b_u64.0[i]; //additional reduction over b let b_reduced = b.add(Self::EPSILON_VECTOR); - let cmp = b_reduced.lt(Self::EPSILON_VECTOR); + let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR); let b = cmp.select(b_reduced, b); //a-b let diff = a.sub(b); let diff_reduced = diff.sub(Self::EPSILON_VECTOR); - let cmp = a.lt(b); + let cmp = a.simd_lt(b); let res = cmp.select(diff_reduced, diff); a_u64.0[i] = res; @@ -159,27 +161,28 @@ 
impl MixedGL { pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self { let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); - let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]); - let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]); + + let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 2, 4, 6, 8, 10, 12, 14] }); + let v: u64x8 = simd_shuffle(part1, part2, const { [1u32, 3, 5, 7, 9, 11, 13, 15] }); //additional reduction over v let v_reduced = v.add(Self::EPSILON_VECTOR_D); - let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D); let v = cmp.select(v_reduced, v); // u + v let sum = u.add(v); let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); - let cmp0 = sum_reduced.lt(sum); - let cmp1 = sum.lt(u); + let cmp0 = sum_reduced.simd_lt(sum); + let cmp1 = sum.simd_lt(u); let reduce_flag = cmp0.bitor(cmp1); let res1 = reduce_flag.select(sum_reduced, sum); // u - v let diff = u.sub(v); let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); - let cmp = u.lt(v); + let cmp = u.simd_lt(v); let res2 = cmp.select(diff_reduced, diff); - let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]); - let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]); + let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 8, 1, 9, 2, 10, 3, 11] }); + let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 12, 5, 13, 6, 14, 7, 15] }); *self = MixedGL::from_u64x8_arrays([part1, part2]); @@ -188,27 +191,27 @@ impl MixedGL { pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self { let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); - let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]); - let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]); + let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 4, 5, 8, 9, 12, 13] }); + let v: u64x8 = simd_shuffle(part1, part2, const { [2u32, 3, 6, 7, 10, 11, 14, 15] }); //additional reduction over v let v_reduced = v.add(Self::EPSILON_VECTOR_D); - let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D); let v = cmp.select(v_reduced, v); // u + v let sum = u.add(v); let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); - let cmp0 = sum_reduced.lt(sum); - let cmp1 = sum.lt(u); + let cmp0 = sum_reduced.simd_lt(sum); + let cmp1 = sum.simd_lt(u); let reduce_flag = cmp0.bitor(cmp1); let res1 = reduce_flag.select(sum_reduced, sum); // u - v let diff = u.sub(v); let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); - let cmp = u.lt(v); + let cmp = u.simd_lt(v); let res2 = cmp.select(diff_reduced, diff); - let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]); - let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]); + let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 8, 9, 2, 3, 10, 11] }); + let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 12, 13, 6, 7, 14, 15] }); *self = MixedGL::from_u64x8_arrays([part1, part2]); @@ -217,27 +220,27 @@ impl MixedGL { pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self { let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); - let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]); - let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]); + let u: u64x8 = simd_shuffle(part1, part2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] }); + let v: u64x8 = 
simd_shuffle(part1, part2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] }); //additional reduction over v let v_reduced = v.add(Self::EPSILON_VECTOR_D); - let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.simd_lt(Self::EPSILON_VECTOR_D); let v = cmp.select(v_reduced, v); // u + v let sum = u.add(v); let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); - let cmp0 = sum_reduced.lt(sum); - let cmp1 = sum.lt(u); + let cmp0 = sum_reduced.simd_lt(sum); + let cmp1 = sum.simd_lt(u); let reduce_flag = cmp0.bitor(cmp1); let res1 = reduce_flag.select(sum_reduced, sum); // u - v let diff = u.sub(v); let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); - let cmp = u.lt(v); + let cmp = u.simd_lt(v); let res2 = cmp.select(diff_reduced, diff); - let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]); - let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]); + let part1: u64x8 = simd_shuffle(res1, res2, const { [0u32, 1, 2, 3, 8, 9, 10, 11] }); + let part2: u64x8 = simd_shuffle(res1, res2, const { [4u32, 5, 6, 7, 12, 13, 14, 15] }); *self = MixedGL::from_u64x8_arrays([part1, part2]); @@ -256,27 +259,27 @@ impl MixedGL { let u = std::slice::from_raw_parts_mut(this as *mut u64, 8); let v = std::slice::from_raw_parts_mut(other as *mut u64, 8); - let a = packed_simd::u64x8::from_slice_aligned(u); - let b = packed_simd::u64x8::from_slice_aligned(v); + let a = u64x8::from_slice(u); + let b = u64x8::from_slice(v); //additional reduction over b let b_reduced = b.add(Self::EPSILON_VECTOR_D); - let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D); + let cmp = b_reduced.simd_lt(Self::EPSILON_VECTOR_D); let b = cmp.select(b_reduced, b); // u + v let sum = a.add(b); let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); - let cmp0 = sum_reduced.lt(sum); - let cmp1 = sum.lt(a); + let cmp0 = sum_reduced.simd_lt(sum); + let cmp1 = sum.simd_lt(a); let reduce_flag = cmp0.bitor(cmp1); let res1 = reduce_flag.select(sum_reduced, sum); // u - v let diff = a.sub(b); let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); - let cmp = a.lt(b); + let cmp = a.simd_lt(b); let res2 = cmp.select(diff_reduced, diff); - res1.write_to_slice_aligned(u); - res2.write_to_slice_aligned(v); + res1.copy_to_slice(u); + res2.copy_to_slice(v); } /// # Safety @@ -323,7 +326,7 @@ impl MixedGL { } #[inline(always)] - pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] { + pub(crate) fn as_u64x8_arrays(input: &Self) -> [u64x8; 2] { // this preserves an alignment unsafe { std::mem::transmute(*input) } } @@ -335,7 +338,7 @@ impl MixedGL { } #[inline(always)] - pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self { + pub(crate) unsafe fn from_u64x8_arrays(input: [u64x8; 2]) -> Self { // this preserves an alignment std::mem::transmute(input) } @@ -412,8 +415,8 @@ impl crate::field::traits::field_like::PrimeFieldLike for MixedGL { for i in 0..4 { let a = a_u64.0[i]; - let is_zero = a.eq(packed_simd::u64x4::splat(0)); - let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a); + let is_zero = a.simd_eq(u64x4::splat(0)); + let neg = u64x4::splat(Self::ORDER).sub(a); let res = is_zero.select(a, neg); a_u64.0[i] = res; diff --git a/src/field/goldilocks/arm_asm_packed_impl.rs b/src/field/goldilocks/arm_asm_packed_impl.rs new file mode 100644 index 0000000..03399c4 --- /dev/null +++ b/src/field/goldilocks/arm_asm_packed_impl.rs @@ -0,0 +1,858 @@ +use crate::cs::implementations::utils::precompute_twiddles_for_fft; +use crate::cs::traits::GoodAllocator; +use 
crate::field::{Field, PrimeField}; +use crate::worker::Worker; +use packed_simd::shuffle; +use std::ops::{Add, BitOr, Sub}; +use std::usize; + +use super::GoldilocksField; + +// we need max of an alignment of u64x4 and u64x8 in this implementation, so 64 + +#[derive(PartialEq, Eq, Hash, Clone, Copy)] +#[repr(C, align(64))] +pub struct MixedGL(pub [GoldilocksField; 16]); + +// we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8 +#[derive(Clone, Copy)] +#[repr(C, align(64))] +struct U64x4Holder([packed_simd::u64x4; 4]); + +impl std::fmt::Debug for MixedGL { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0) + } +} + +impl std::fmt::Display for MixedGL { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0) + } +} + +impl MixedGL { + pub const ORDER_BITS: usize = GoldilocksField::ORDER_BITS; + pub const ORDER: u64 = GoldilocksField::ORDER; + pub const TWO_ADICITY: usize = GoldilocksField::TWO_ADICITY; + pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY; + pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000 + pub const EPSILON: u64 = (1 << 32) - 1; + pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON); + pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON); + + #[inline(always)] + pub fn new() -> Self { + Self([GoldilocksField::ZERO; 16]) + } + + #[inline(always)] + pub fn from_constant(value: GoldilocksField) -> Self { + Self([value; 16]) + } + + #[inline(always)] + pub fn from_array(value: [GoldilocksField; 16]) -> Self { + Self(value) + } + + #[inline(always)] + #[unroll::unroll_for_loops] + pub fn to_reduced(&mut self) -> &mut Self { + let mut a_u64 = Self::as_u64x4_arrays(self); + + for i in 0..4 { + let a = a_u64.0[i]; + let a_reduced = a.add(Self::EPSILON_VECTOR); + let cmp = a_reduced.lt(Self::EPSILON_VECTOR); + let res = cmp.select(a_reduced, a); + + a_u64.0[i] = res; + } + + unsafe { + *self = Self::from_u64x4_arrays(a_u64); + } + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + pub fn mul_constant_assign(&'_ mut self, other: &GoldilocksField) -> &mut Self { + for i in 0..16 { + self.0[i].mul_assign(other); + } + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn mul_assign_impl(&mut self, other: &Self) -> &mut Self { + for i in 0..16 { + self.0[i].mul_assign(&other.0[i]); + } + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn add_assign_impl(&mut self, other: &Self) -> &mut Self { + let mut a_u64 = Self::as_u64x4_arrays(self); + let b_u64 = Self::as_u64x4_arrays(other); + + for i in 0..4 { + let a = a_u64.0[i]; + let b = b_u64.0[i]; + //additional reduction over b + let b_reduced = b.add(Self::EPSILON_VECTOR); + let cmp = b_reduced.lt(Self::EPSILON_VECTOR); + let b = cmp.select(b_reduced, b); + //a+b + let sum = a.add(b); + let sum_reduced = sum.add(Self::EPSILON_VECTOR); + let cmp0 = sum_reduced.lt(sum); + let cmp1 = sum.lt(a); + let reduce_flag = cmp0.bitor(cmp1); + let res = reduce_flag.select(sum_reduced, sum); + + a_u64.0[i] = res; + } + + unsafe { + *self = Self::from_u64x4_arrays(a_u64); + } + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn sub_assign_impl(&'_ mut self, other: &Self) -> &mut Self { + let mut a_u64 = Self::as_u64x4_arrays(self); + let b_u64 = Self::as_u64x4_arrays(other); + + for i in 0..4 { + let a = a_u64.0[i]; + let b = b_u64.0[i]; + //additional reduction over b + 
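// The reduction trick: ORDER == 2^64 - EPSILON, so `b + EPSILON` wraps around 2^64
+        // exactly when b >= ORDER, and the wrapped value is b - ORDER, which is < EPSILON;
+        // the `lt` mask below therefore selects the canonical representative.
+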
let b_reduced = b.add(Self::EPSILON_VECTOR); + let cmp = b_reduced.lt(Self::EPSILON_VECTOR); + let b = cmp.select(b_reduced, b); + //a-b + let diff = a.sub(b); + let diff_reduced = diff.sub(Self::EPSILON_VECTOR); + let cmp = a.lt(b); + let res = cmp.select(diff_reduced, diff); + + a_u64.0[i] = res; + } + + unsafe { + *self = Self::from_u64x4_arrays(a_u64); + } + + self + } + + pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self { + let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); + let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]); + let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]); + //additional reduction over v + let v_reduced = v.add(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let v = cmp.select(v_reduced, v); + // u + v + let sum = u.add(v); + let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); + let cmp0 = sum_reduced.lt(sum); + let cmp1 = sum.lt(u); + let reduce_flag = cmp0.bitor(cmp1); + let res1 = reduce_flag.select(sum_reduced, sum); + // u - v + let diff = u.sub(v); + let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); + let cmp = u.lt(v); + let res2 = cmp.select(diff_reduced, diff); + + let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]); + let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]); + + *self = MixedGL::from_u64x8_arrays([part1, part2]); + + self + } + + pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self { + let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); + let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]); + let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]); + //additional reduction over v + let v_reduced = v.add(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let v = cmp.select(v_reduced, v); + // u + v + let sum = u.add(v); + let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); + let cmp0 = sum_reduced.lt(sum); + let cmp1 = sum.lt(u); + let reduce_flag = cmp0.bitor(cmp1); + let res1 = reduce_flag.select(sum_reduced, sum); + // u - v + let diff = u.sub(v); + let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); + let cmp = u.lt(v); + let res2 = cmp.select(diff_reduced, diff); + + let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]); + let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]); + + *self = MixedGL::from_u64x8_arrays([part1, part2]); + + self + } + + pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self { + let [part1, part2] = MixedGL::as_u64x8_arrays(&*self); + let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]); + let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]); + //additional reduction over v + let v_reduced = v.add(Self::EPSILON_VECTOR_D); + let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D); + let v = cmp.select(v_reduced, v); + // u + v + let sum = u.add(v); + let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); + let cmp0 = sum_reduced.lt(sum); + let cmp1 = sum.lt(u); + let reduce_flag = cmp0.bitor(cmp1); + let res1 = reduce_flag.select(sum_reduced, sum); + // u - v + let diff = u.sub(v); + let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); + let cmp = u.lt(v); + let res2 = cmp.select(diff_reduced, diff); + + let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]); + let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]); + 
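+        // (The two shuffles above re-interleave the eight sums in `res1` and the eight
+        // differences in `res2` back into the 4-element block layout of the input.)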
+ *self = MixedGL::from_u64x8_arrays([part1, part2]); + + self + } + + /// # Safety + /// + /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 8, and should point + /// to memory that can be mutated. + /// No references to the same memory should exist when this function is called. + /// Pointers should be different. + pub unsafe fn butterfly_8x8_impl(this: *const u64, other: *const u64) { + debug_assert!(this.addr() % std::mem::align_of::() == 0); + debug_assert!(other.addr() % std::mem::align_of::() == 0); + + let u = std::slice::from_raw_parts_mut(this as *mut u64, 8); + let v = std::slice::from_raw_parts_mut(other as *mut u64, 8); + let a = packed_simd::u64x8::from_slice_aligned(u); + let b = packed_simd::u64x8::from_slice_aligned(v); + //additional reduction over b + let b_reduced = b.add(Self::EPSILON_VECTOR_D); + let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D); + let b = cmp.select(b_reduced, b); + // u + v + let sum = a.add(b); + let sum_reduced = sum.add(Self::EPSILON_VECTOR_D); + let cmp0 = sum_reduced.lt(sum); + let cmp1 = sum.lt(a); + let reduce_flag = cmp0.bitor(cmp1); + let res1 = reduce_flag.select(sum_reduced, sum); + // u - v + let diff = a.sub(b); + let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D); + let cmp = a.lt(b); + let res2 = cmp.select(diff_reduced, diff); + + res1.write_to_slice_aligned(u); + res2.write_to_slice_aligned(v); + } + + /// # Safety + /// + /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 16, and should point + /// to memory that can be mutated. + /// No references to the same memory should exist when this function is called. + /// Pointers should be different. + pub unsafe fn butterfly_16x16_impl(mut this: *mut u64, mut other: *mut u64) { + debug_assert!(this.addr() % std::mem::align_of::() == 0); + debug_assert!(other.addr() % std::mem::align_of::() == 0); + + Self::butterfly_8x8_impl(this, other); + this = this.offset(8); + other = other.offset(8); + Self::butterfly_8x8_impl(this, other); + } + + // pub unsafe fn butterfly_16x16_impl( + // this: &mut Self, + // other: &mut Self, + // ) { + // let mut this_ptr = this.0.as_ptr() as *mut u64; + // let mut other_ptr = other.0.as_ptr() as *mut u64; + + // debug_assert!(this_ptr.addr() % std::mem::align_of::() == 0); + // debug_assert!(other_ptr.addr() % std::mem::align_of::() == 0); + + // Self::butterfly_8x8_impl(this_ptr, other_ptr); + // this_ptr = this_ptr.offset(8); + // other_ptr = other_ptr.offset(8); + // Self::butterfly_8x8_impl(this_ptr, other_ptr); + // } + + #[inline(always)] + pub fn from_field_array(input: [GoldilocksField; 16]) -> Self { + Self(input) + } + + #[inline(always)] + fn as_u64x4_arrays(input: &Self) -> U64x4Holder { + // this preserves an alignment + unsafe { std::mem::transmute(*input) } + } + + #[inline(always)] + pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] { + // this preserves an alignment + unsafe { std::mem::transmute(*input) } + } + + #[inline(always)] + unsafe fn from_u64x4_arrays(input: U64x4Holder) -> Self { + // this preserves an alignment + std::mem::transmute(input) + } + + #[inline(always)] + pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self { + // this preserves an alignment + std::mem::transmute(input) + } + + #[inline(always)] + pub fn vec_add_assign(a: &mut [Self], b: &[Self]) { + use crate::field::traits::field_like::PrimeFieldLike; + for (a, b) in a.iter_mut().zip(b.iter()) { + a.add_assign(b, &mut ()); + } + } + + 
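// NOTE: arm_asm_impl.rs earlier in this diff is the std::simd port of this file;
+    // the API mapping used there is roughly (assuming a nightly with portable SIMD):
+    //   packed_simd::u64x4::splat(c) in consts -> std::simd::u64x4::from_array([c; 4])  (splat is not const-callable)
+    //   a.lt(b) / a.eq(b)                      -> a.simd_lt(b) / a.simd_eq(b)  (SimdPartialOrd / SimdPartialEq)
+    //   shuffle!(a, b, [i, ...])               -> std::intrinsics::simd::simd_shuffle(a, b, const { [i as u32, ...] })
+    //   u64x8::from_slice_aligned(s)           -> u64x8::from_slice(s)  (no alignment requirement)
+    //   v.write_to_slice_aligned(s)            -> v.copy_to_slice(s)
+    //   mask.select(t, f)                      -> mask.select(t, f)  (unchanged)
+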
#[inline(always)] + pub fn vec_mul_assign(a: &mut [Self], b: &[Self]) { + use crate::field::traits::field_like::PrimeFieldLike; + for (a, b) in a.iter_mut().zip(b.iter()) { + a.mul_assign(b, &mut ()); + } + } +} + +impl Default for MixedGL { + fn default() -> Self { + Self([GoldilocksField::ZERO; 16]) + } +} + +impl crate::field::traits::field_like::PrimeFieldLike for MixedGL { + type Base = GoldilocksField; + type Context = (); + + #[inline(always)] + fn zero(_ctx: &mut Self::Context) -> Self { + Self([GoldilocksField::ZERO; 16]) + } + #[inline(always)] + fn one(_ctx: &mut Self::Context) -> Self { + Self([GoldilocksField::ONE; 16]) + } + #[inline(always)] + fn minus_one(_ctx: &mut Self::Context) -> Self { + Self([GoldilocksField::MINUS_ONE; 16]) + } + + #[inline(always)] + fn add_assign(&mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self { + Self::add_assign_impl(self, other) + } + + #[inline(always)] + fn sub_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self { + Self::sub_assign_impl(self, other) + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn mul_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self { + Self::mul_assign_impl(self, other) + } + + #[inline(always)] + fn square(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self { + let t = *self; + self.mul_assign(&t, _ctx); + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn negate(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self { + let mut a_u64 = Self::as_u64x4_arrays(self); + + for i in 0..4 { + let a = a_u64.0[i]; + + let is_zero = a.eq(packed_simd::u64x4::splat(0)); + let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a); + let res = is_zero.select(a, neg); + + a_u64.0[i] = res; + } + + unsafe { + *self = Self::from_u64x4_arrays(a_u64); + } + + self + } + + #[inline(always)] + fn double(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self { + let t = *self; + self.add_assign(&t, _ctx); + + self + } + + #[inline(always)] + #[unroll::unroll_for_loops] + fn inverse(&self, _ctx: &mut Self::Context) -> Self { + let mut result = *self; + for i in 0..16 { + result.0[i] = PrimeField::inverse(&result.0[i]).expect("inverse must exist"); + } + + result + } + + #[inline(always)] + fn constant(value: Self::Base, _ctx: &mut Self::Context) -> Self { + Self([value; 16]) + } +} + +impl crate::field::traits::field_like::PrimeFieldLikeVectorized for MixedGL { + type Twiddles = Vec; + type InverseTwiddles = Vec; + #[inline(always)] + fn is_zero(&self) -> bool { + self.0 == [GoldilocksField::ZERO; 16] + } + + #[inline(always)] + fn equals(&self, other: &Self) -> bool { + self.eq(other) + } + + #[inline(always)] + fn mul_all_by_base(&'_ mut self, other: &Self::Base, _ctx: &mut Self::Context) -> &'_ mut Self { + Self::mul_constant_assign(self, other) + } + + #[inline(always)] + fn slice_from_base_slice(input: &[Self::Base]) -> &[Self] { + if input.len() < Self::SIZE_FACTOR { + panic!("too small input size to cast"); + } + debug_assert!(input.len() % Self::SIZE_FACTOR == 0); + debug_assert!(input.as_ptr().addr() % std::mem::align_of::() == 0); + let result_len = input.len() / 16; + unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut Self, result_len) } + } + + #[inline(always)] + fn slice_into_base_slice(input: &[Self]) -> &[Self::Base] { + let result_len = input.len() * 16; + unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut GoldilocksField, result_len) } + } + + #[inline(always)] + fn slice_into_base_slice_mut(input: &mut [Self]) -> 
&mut [Self::Base] { + let result_len = input.len() * 16; + unsafe { + std::slice::from_raw_parts_mut(input.as_ptr() as *mut GoldilocksField, result_len) + } + } + + #[inline(always)] + fn vec_from_base_vec(input: Vec) -> Vec { + if input.len() < Self::SIZE_FACTOR { + panic!("too small input size to cast"); + } + let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc(); + debug_assert!(ptr.addr() % std::mem::align_of::() == 0); + debug_assert!(len % Self::SIZE_FACTOR == 0); + debug_assert!(capacity % Self::SIZE_FACTOR == 0); + + unsafe { + Vec::from_raw_parts_in( + ptr as _, + len / Self::SIZE_FACTOR, + capacity / Self::SIZE_FACTOR, + allocator, + ) + } + } + + #[inline(always)] + fn vec_into_base_vec(input: Vec) -> Vec { + let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc(); + + unsafe { + Vec::from_raw_parts_in( + ptr as _, + len * Self::SIZE_FACTOR, + capacity * Self::SIZE_FACTOR, + allocator, + ) + } + } + + #[inline(always)] + fn fft_natural_to_bitreversed( + input: &mut [Self], + coset: Self::Base, + twiddles: &Self::Twiddles, + _ctx: &mut Self::Context, + ) { + // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input); + // crate::fft::fft_natural_to_bitreversed_cache_friendly(input, coset, twiddles); + + crate::fft::fft_natural_to_bitreversed_mixedgl(input, coset, twiddles); + } + + #[inline(always)] + fn ifft_natural_to_natural( + input: &mut [Self], + coset: Self::Base, + twiddles: &Self::InverseTwiddles, + _ctx: &mut Self::Context, + ) { + // let input = crate::utils::cast_check_alignment_ref_mut_unpack::(input); + // crate::fft::ifft_natural_to_natural_cache_friendly(input, coset, twiddles); + + crate::fft::ifft_natural_to_natural_mixedgl(input, coset, twiddles); + } + + #[inline(always)] + fn precompute_forward_twiddles_for_fft( + fft_size: usize, + worker: &Worker, + ctx: &mut Self::Context, + ) -> Self::Twiddles { + precompute_twiddles_for_fft::( + fft_size, worker, ctx, + ) + } + + #[inline(always)] + fn precompute_inverse_twiddles_for_fft( + fft_size: usize, + worker: &Worker, + ctx: &mut Self::Context, + ) -> Self::Twiddles { + precompute_twiddles_for_fft::( + fft_size, worker, ctx, + ) + } +} + +#[cfg(test)] +mod test { + + use crate::field::goldilocks::MixedGL; + use crate::field::rand_from_rng; + use crate::field::traits::field_like::PrimeFieldLike; + use crate::field::traits::field_like::PrimeFieldLikeVectorized; + use crate::field::{goldilocks::GoldilocksField, Field}; + use crate::utils::clone_respecting_allignment; + + #[test] + fn test_mixedgl_negate() { + let mut ctx = (); + const POLY_SIZE: usize = 1 << 20; + let mut rng = rand::thread_rng(); + + // Generate random Vec + let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + + let mut ag = a.clone(); + + for aa in ag.iter_mut() { + Field::negate(aa); + } + + let mut av: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &a, + )); + + // Test over GLPS + for aa in av.iter_mut() { + aa.negate(&mut ctx); + } + + assert_eq!(MixedGL::vec_into_base_vec(av), ag); + } + + use rand::Rng; + + #[test] + fn test_mixedgl_add_assign() { + let mut ctx = (); + const POLY_SIZE: usize = 1 << 24; + let mut rng = rand::thread_rng(); + let _s = GoldilocksField(0x0000000001000000); + + // Generate random Vec + // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000000000001)).collect(); + 
// let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000001000000)).collect(); + let b: Vec = (0..POLY_SIZE) + .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX))) + .collect(); + let a: Vec = (0..POLY_SIZE) + .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX))) + .collect(); + // let a: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xfffffffff67f1442)).collect(); + // let b: Vec = (0..POLY_SIZE).map(|_| GoldilocksField(0xffffffff9c1d065d)).collect(); + + // dbg!(&a); + // dbg!(&b); + + let mut ag = a.clone(); + let bg = b.clone(); + + for (aa, bb) in ag.iter_mut().zip(bg.iter()) { + Field::add_assign(aa, bb); + } + + let mut av: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &a, + )); + let bv: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &b, + )); + + // Test over GLPS + for (aa, bb) in av.iter_mut().zip(bv.iter()) { + aa.add_assign(bb, &mut ctx); + } + + let avv = MixedGL::vec_into_base_vec(av); + // for i in 0..avv.len() { + // assert_eq!(avv[i], ag[i], "error {}", i); + // } + + // dbg!(&ag[0]); + // dbg!(&avv[0]); + + assert_eq!(avv, ag); + } + + #[test] + fn test_mixedgl_sub_assign() { + let mut ctx = (); + const POLY_SIZE: usize = 1 << 20; + let _rng = rand::thread_rng(); + + // Generate random Vec + // let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + // let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + let a: Vec = (0..POLY_SIZE) + .map(|_| GoldilocksField(0x0000000000000001)) + .collect(); + let b: Vec = (0..POLY_SIZE) + .map(|_| GoldilocksField(0x0000000001000000)) + .collect(); + + // Test over Goldilocks + let mut ag = a.clone(); + let bg = b.clone(); + + for (aa, bb) in ag.iter_mut().zip(bg.iter()) { + Field::sub_assign(aa, bb); + } + + let mut av: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &a, + )); + let bv: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &b, + )); + + // Test over GLPS + for (aa, bb) in av.iter_mut().zip(bv.iter()) { + aa.sub_assign(bb, &mut ctx); + } + + // dbg!(&ag); + // dbg!(&av); + + assert_eq!(ag, MixedGL::vec_into_base_vec(av)); + } + + #[test] + fn test_mixedgl_mul_assign() { + let mut ctx = (); + const POLY_SIZE: usize = 1 << 20; + let mut rng = rand::thread_rng(); + + // Generate random Vec + let a: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + let b: Vec = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect(); + + // Test over Goldilocks + let mut ag = a.clone(); + let bg = b.clone(); + + for (aa, bb) in ag.iter_mut().zip(bg.iter()) { + Field::mul_assign(aa, bb); + } + + let mut av: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &a, + )); + let bv: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &b, + )); + + // Test over GLPS + for (aa, bb) in av.iter_mut().zip(bv.iter()) { + aa.mul_assign(bb, &mut ctx); + } + + // dbg!(&ag); + // dbg!(&av); + + assert_eq!(ag, MixedGL::vec_into_base_vec(av)); + } + + #[test] + fn test_mixedgl_butterfly16x16() { + // let mut ctx = (); + + // let am: [u64;32] = [0x0001000000000000, 0x0000000000000001, 0x0001000000000000, 0x0000000000000001, 0x0000000000000000, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0001000000000000, 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xfffeffff00000001, 0xfffeffff00000002, 0xfffeffff00000002, + // 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, 0x0001000000000001, 
0xfffeffff00000001, 0xffffffff00000000, 0x0001000000000000, 0xfffeffff00000002, 0x0000000000000000, 0xfffeffff00000001, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0000000000000001, 0x0001000000000000]; + + let am: [u64; 32] = [ + 0x0001000000000000, + 0x0000000000000001, + 0x0001000000000000, + 0x0000000000000001, + 0x0000000000000000, + 0xffffffff00000000, + 0x0000000000000001, + 0x0000ffffffffffff, + 0x0000000000000000, + 0x0001000000000000, + 0xffffffff00000000, + 0xffffffff00000000, + 0xffffffff00000000, + 0xfffeffff00000001, + 0xfffeffff00000002, + 0xfffeffff00000002, + 0x0000000000000000, + 0xffffffff01000001, + 0x0000000000000000, + 0x0000010000ffff00, + 0xfffffeff00000101, + 0xfffffffeff000001, + 0x000000ffffffff00, + 0xfffffeff01000101, + 0x0000000000000000, + 0xfffffeff00000101, + 0xfffffffeff000001, + 0xffffffff01000001, + 0x000000fffeffff00, + 0x0000000000000000, + 0xffffffff01000001, + 0x000000ffffffff00, + ]; + + let a: Vec = am.into_iter().map(GoldilocksField).collect(); + // let b: Vec = bm.into_iter().map(GoldilocksField).collect(); + let _s = GoldilocksField(0x0000000001000000); + + // Test over Goldilocks + let mut ag = a.clone(); + // let mut bg = b.clone(); + let distance_in_cache = 16; + + let mut j = 0; + while j < 16 { + let mut u = ag[j]; + let v = ag[j + distance_in_cache]; + // Field::mul_assign(&mut v, &s); + Field::sub_assign(&mut u, &v); + ag[j + distance_in_cache] = u; + Field::add_assign(&mut ag[j], &v); + + j += 1; + } + + let av: Vec = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &a, + )); + // let mut bv: Vec = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&b)); + // let mut av = av[0]; + // let mut bv = bv[0]; + + // Test over MixedGL + // av[1].mul_constant_assign(&s); + unsafe { + MixedGL::butterfly_16x16_impl( + av[0].0.as_ptr() as *mut u64, + av[1].0.as_ptr() as *mut u64, + ); + } + // let mut u = av[0]; + // let mut v = av[1]; + // unsafe { MixedGL::butterfly_16x16_impl(&mut u, &mut v); } + // av[0] = u; + // av[1] = v; + + let ag = + MixedGL::vec_from_base_vec(clone_respecting_allignment::( + &ag, + )); + // let bg = MixedGL::vec_from_base_vec(clone_respecting_allignment::(&bg)); + + dbg!(&ag); + dbg!(&av); + + // dbg!(&bg); + // dbg!(&bv); + + assert_eq!(ag, av); + // assert_eq!(bg, bv); + } +} diff --git a/src/field/goldilocks/mod.rs b/src/field/goldilocks/mod.rs index 10daec1..82fa6be 100644 --- a/src/field/goldilocks/mod.rs +++ b/src/field/goldilocks/mod.rs @@ -12,10 +12,18 @@ mod extension; mod inversion; #[cfg(all( + not(feature = "include_packed_simd"), any(target_feature = "neon", target_feature = "avx2"), not(all(target_feature = "avx512f", target_feature = "avx512vl")) ))] pub mod arm_asm_impl; + +#[cfg(all( + feature = "include_packed_simd", + any(target_feature = "neon", target_feature = "avx2"), + not(all(target_feature = "avx512f", target_feature = "avx512vl")) +))] +pub mod arm_asm_packed_impl; #[cfg(not(any( all(target_feature = "avx512f", target_feature = "avx512vl"), target_feature = "neon", @@ -43,10 +51,19 @@ pub mod x86_64_asm_impl; pub mod avx512_impl; #[cfg(all( + not(feature = "include_packed_simd"), any(target_feature = "neon", target_feature = "avx2"), not(all(target_feature = "avx512f", target_feature = "avx512vl")) ))] pub use arm_asm_impl::*; + +#[cfg(all( + feature = "include_packed_simd", + any(target_feature = "neon", target_feature = "avx2"), + not(all(target_feature = "avx512f", target_feature = "avx512vl")) +))] +pub use arm_asm_packed_impl::*; + 
#[cfg(not(any( all(target_feature = "avx512f", target_feature = "avx512vl"), target_feature = "neon", diff --git a/src/gadgets/curves/.gitignore b/src/gadgets/curves/.gitignore new file mode 100644 index 0000000..953d618 --- /dev/null +++ b/src/gadgets/curves/.gitignore @@ -0,0 +1,3 @@ +sage/*.sage.py +bn256/*.sage.py +.ipynb_checkpoints \ No newline at end of file diff --git a/src/gadgets/curves/sw_projective/extended.rs b/src/gadgets/curves/sw_projective/extended.rs new file mode 100644 index 0000000..549691b --- /dev/null +++ b/src/gadgets/curves/sw_projective/extended.rs @@ -0,0 +1,583 @@ +// Short Weierstrass projective curve point implementation. +// Primarily based on the paper: https://eprint.iacr.org/2015/1060.pdf + +use self::curves::non_native_field::traits::CurveCompatibleNonNativeField; + +use super::*; + +use crate::gadgets::traits::selectable::Selectable; +use crate::{cs::traits::cs::ConstraintSystem, gadgets::boolean::Boolean}; +use pairing::ff::PrimeField; +use pairing::GenericCurveAffine; + +/// ExtendedSWProjectivePoint is the same structure as SWProjectivePoint, but with the additional +/// flexibility that GenericCurveAffine::Base is not necessarily the PrimeField. It is implemented +/// separately to avoid any potential conflicts with the existing SWProjectivePoint. +#[derive(Derivative)] +#[derivative(Clone, Debug)] +pub struct ExtendedSWProjectivePoint<F, T, C, NN> +where + F: SmallField, + T: PrimeField, + C: GenericCurveAffine, + NN: CurveCompatibleNonNativeField<F, T, C>, +{ + pub x: NN, + pub y: NN, + pub z: NN, + pub _marker: std::marker::PhantomData<(F, T, C)>, +} + +impl<F, T, C, NN> ExtendedSWProjectivePoint<F, T, C, NN> +where + F: SmallField, + T: PrimeField, + C: GenericCurveAffine, + NN: CurveCompatibleNonNativeField<F, T, C>, +{ + pub fn from_xy_unchecked<CS: ConstraintSystem<F>>(cs: &mut CS, x: NN, y: NN) -> Self { + let params = x.get_params(); + let z = NN::allocated_constant(cs, T::one(), params); + + Self { + x, + y, + z, + _marker: std::marker::PhantomData, + } + } + + /// Checks whether the point is normalized, i.e. whether the Z coordinate equals one.
+ pub fn is_normalized<CS: ConstraintSystem<F>>(&mut self, cs: &mut CS) -> Boolean<F> { + let mut one = NN::allocated_constant(cs, T::one(), self.x.get_params()); + self.z.equals(cs, &mut one) + } + + pub fn zero<CS: ConstraintSystem<F>>(cs: &mut CS, params: &std::sync::Arc<NN::Params>) -> Self { + let x = NN::allocated_constant(cs, T::zero(), params); + let y = NN::allocated_constant(cs, T::one(), params); + let z = NN::allocated_constant(cs, T::zero(), params); + + Self { + x, + y, + z, + _marker: std::marker::PhantomData, + } + } + + pub fn one<CS: ConstraintSystem<F>>(cs: &mut CS, params: &std::sync::Arc<NN::Params>) -> Self { + use pairing::ff::Field; + + let one = C::one(); + let (x, y) = one.into_xy_unchecked(); + let x = NN::from_curve_base(cs, &x, params); + let y = NN::from_curve_base(cs, &y, params); + let z = NN::from_curve_base(cs, &C::Base::one(), params); + + Self { + x, + y, + z, + _marker: std::marker::PhantomData, + } + } + + pub fn double<CS: ConstraintSystem<F>>(&mut self, cs: &mut CS) -> Self { + use pairing::ff::Field; + if C::a_coeff().is_zero() == false { + return self.generic_double(cs); + } + let params = self.x.get_params().clone(); + + let mut three = T::one(); + three.double(); + three.add_assign(&T::one()); + + let mut four = T::one(); + four.double(); + four.double(); + + let mut curve_b = NN::from_curve_base(cs, &C::b_coeff(), &params); + let mut curve_b3 = curve_b.double(cs); + let mut curve_b3 = curve_b3.add(cs, &mut curve_b); + + let mut three_nn = NN::allocated_constant(cs, three, &params); + let mut four_nn = NN::allocated_constant(cs, four, &params); + + let x = &mut self.x; + let y = &mut self.y; + let z = &mut self.z; + + // t0 = y * y + let mut t0 = y.square(cs); + // t2 = b3 * z * z + let mut b3_mul_z = z.mul(cs, &mut curve_b3); + let mut t2 = b3_mul_z.mul(cs, z); + // y3 = t0 + t2 + let mut y3: NN = t0.add(cs, &mut t2); + // t1 = y * z + let mut t1 = y.mul(cs, z); + // z3 = 8 * t0 * t1 + let mut t0_mul_4 = t0.mul(cs, &mut four_nn); + let mut t0_mul_8 = t0_mul_4.double(cs); + let z3 = t0_mul_8.mul(cs, &mut t1); + // t4 = 4 * t0 - 3 * y3 + let mut y3_mul_3 = y3.mul(cs, &mut three_nn); + let mut t4 = t0_mul_4.sub(cs, &mut y3_mul_3); + // y3 = t4 * y3 + let mut y3 = t4.mul(cs, &mut y3); + // y3 = 8 * t0 * t2 + y3 + let mut new_y3 = t0_mul_8.mul(cs, &mut t2); + let new_y3 = new_y3.add(cs, &mut y3); + let y3 = new_y3; + // t1 = x * y + let mut t1 = x.mul(cs, y); + // x3 = 2 * t4 * t1 + let mut t4_mul_2 = t4.double(cs); + let x3 = t4_mul_2.mul(cs, &mut t1); + + let new = Self { + x: x3, + y: y3, + z: z3, + _marker: std::marker::PhantomData, + }; + + new + } + + fn generic_double<CS: ConstraintSystem<F>>(&mut self, cs: &mut CS) -> Self { + use pairing::ff::Field; + let params = self.x.get_params().clone(); + + let curve_b = C::b_coeff(); + let mut curve_b3 = curve_b; + curve_b3.double(); + curve_b3.add_assign(&curve_b); + + let mut curve_a = NN::from_curve_base(cs, &C::a_coeff(), &params); + let mut curve_b3 = NN::from_curve_base(cs, &curve_b3, &params); + + let x = &mut self.x; + let y = &mut self.y; + let z = &mut self.z; + + // t0 = x * x + let mut t0 = x.square(cs); + // t1 = y * y + let mut t1 = y.square(cs); + // t2 = z * z + let mut t2 = z.square(cs); + + // t3 = x * y + let mut t3 = x.mul(cs, y); + // t3 = t3 + t3 + let mut t3 = t3.double(cs); + // z3 = x * z + let mut z3 = x.mul(cs, z); + + // z3 = z3 + z3 + let mut z3 = z3.double(cs); + // x3 = a * z3 + let mut x3 = curve_a.mul(cs, &mut z3); + // y3 = b3 * t2 + let mut y3 = curve_b3.mul(cs, &mut t2); + + // y3 = x3 + y3 + let mut y3 = x3.add(cs, &mut y3); + // x3 = t1 - y3 + let mut x3 = t1.sub(cs, &mut y3); + // y3 = t1 + y3 + let mut y3 = t1.add(cs, &mut 
y3); + + // y3 = x3 * y3 + let mut y3 = x3.mul(cs, &mut y3); + // x3 = t3 * x3 + let mut x3 = t3.mul(cs, &mut x3); + // z3 = b3 * z3 + let mut z3 = curve_b3.mul(cs, &mut z3); + + // t2 = a * t2 + let mut t2 = curve_a.mul(cs, &mut t2); + // t3 = t0 - t2 + let mut t3 = t0.sub(cs, &mut t2); + // t3 = a * t3 + let mut t3 = curve_a.mul(cs, &mut t3); + + // t3 = t3 + z3 + let mut t3 = t3.add(cs, &mut z3); + // z3 = t0 + t0 + let mut z3 = t0.double(cs); + // t0 = z3 + t0 + let mut t0 = z3.add(cs, &mut t0); + + // t0 = t0 + t2 + let mut t0 = t0.add(cs, &mut t2); + // t0 = t0 * t3 + let mut t0 = t0.mul(cs, &mut t3); + // y3 = y3 + t0 + let y3 = y3.add(cs, &mut t0); + + // t2 = y * z + let mut t2 = y.mul(cs, z); + // t2 = t2 + t2 + let mut t2 = t2.double(cs); + // t0 = t2 * t3 + let mut t0 = t2.mul(cs, &mut t3); + + // x3 = x3 - t0 + let x3 = x3.sub(cs, &mut t0); + // z3 = t2 * t1 + let mut z3 = t2.mul(cs, &mut t1); + // z3 = z3 + z3 + let mut z3 = z3.double(cs); + + // z3 = z3 + z3 + let z3 = z3.double(cs); + + let new = Self { + x: x3, + y: y3, + z: z3, + _marker: std::marker::PhantomData, + }; + + new + } + + pub fn negated<CS: ConstraintSystem<F>>(&mut self, cs: &mut CS) -> Self { + let y_negated = self.y.negated(cs); + + let new = Self { + x: self.x.clone(), + y: y_negated, + z: self.z.clone(), + _marker: std::marker::PhantomData, + }; + + new + } + + fn add_sub_mixed_impl<CS: ConstraintSystem<F>>( + &mut self, + cs: &mut CS, + other_xy: &mut (NN, NN), + is_subtraction: bool, + ) -> Self { + use pairing::ff::Field; + if C::a_coeff().is_zero() == false { + return self.generic_add_sub_mixed_impl(cs, other_xy, is_subtraction); + } + + let params = self.x.get_params().clone(); + + let mut three = T::one(); + three.double(); + three.add_assign(&T::one()); + + let curve_b = C::b_coeff(); + let mut curve_b3 = curve_b; + curve_b3.double(); + curve_b3.add_assign(&curve_b); + + let mut curve_b6 = curve_b3; + curve_b6.double(); + + let mut three_nn = NN::allocated_constant(cs, three, &params); + let mut curve_b3 = NN::from_curve_base(cs, &curve_b3, &params); + let mut curve_b6 = NN::from_curve_base(cs, &curve_b6, &params); + + let x1 = &mut self.x; + let y1 = &mut self.y; + let z1 = &mut self.z; + + let mut y2_local: NN = other_xy.1.clone(); + let x2 = &mut other_xy.0; + if is_subtraction { + y2_local = y2_local.negated(cs); + } + let y2 = &mut y2_local; + + // t4 = y2 * z1 + y1 + let mut t4 = y2.mul(cs, z1); + let mut t4 = t4.add(cs, y1); + + // y3 = x2 * z1 + x1 + let mut y3 = x2.mul(cs, z1); + let mut y3 = y3.add(cs, x1); + + // z3 = y1 * y2 + b3 * z1 + let mut z1_mul_b3 = z1.mul(cs, &mut curve_b3); + let mut z3 = y1.mul(cs, y2); + let mut z3 = z3.add(cs, &mut z1_mul_b3); + + // t0 = x1 * x2 + let mut t0 = x1.mul(cs, x2); + + // t3 = (x2 + y2) * (x1 + y1) - t0 - z3 + b3 * z1 + let mut a = x2.add(cs, y2); + let mut b = x1.add(cs, y1); + let mut t3 = a.mul(cs, &mut b); + let mut t3 = t3.sub(cs, &mut t0); + let mut t3 = t3.sub(cs, &mut z3); + let mut t3 = t3.add(cs, &mut z1_mul_b3); + + // x3 = t4 * b3 * y3 + let mut y3_mul_b3 = y3.mul(cs, &mut curve_b3); + let mut x3 = t4.mul(cs, &mut y3_mul_b3); + + // t1 = z3 - 2 * b3 * z1 + let mut z1_mul_2_b3 = z1.mul(cs, &mut curve_b6); + let mut t1 = z3.sub(cs, &mut z1_mul_2_b3); + + // x3 = t3 * t1 - x3 + let mut new_x3 = t3.mul(cs, &mut t1); + let new_x3 = new_x3.sub(cs, &mut x3); + let x3 = new_x3; + + // y3 = (b3 * y3) * (3 * t0) + let mut t0_mul_3 = t0.mul(cs, &mut three_nn); + let mut y3 = y3_mul_b3.mul(cs, &mut t0_mul_3); + + // y3 = t1 * z3 + y3 + let mut new_y3 = t1.mul(cs, &mut z3); + let new_y3 = new_y3.add(cs, 
&mut y3); + let y3 = new_y3; + + // t0 = (3 * t0) * t3 + let mut t0 = t0_mul_3.mul(cs, &mut t3); + + // z3 = z3 * t4 + t0 + let mut z3 = z3.mul(cs, &mut t4); + let z3 = z3.add(cs, &mut t0); + + let new = Self { + x: x3, + y: y3, + z: z3, + _marker: std::marker::PhantomData, + }; + + new + } + + fn generic_add_sub_mixed_impl>( + &mut self, + cs: &mut CS, + other_xy: &mut (NN, NN), + is_subtraction: bool, + ) -> Self { + use pairing::ff::Field; + let params = self.x.get_params().clone(); + + let curve_b = C::b_coeff(); + let mut curve_b3 = curve_b; + curve_b3.double(); + curve_b3.add_assign(&curve_b); + + let mut curve_a = NN::from_curve_base(cs, &C::a_coeff(), ¶ms); + let mut curve_b3 = NN::from_curve_base(cs, &curve_b3, ¶ms); + + let x1 = &mut self.x; + let y1 = &mut self.y; + let z1 = &mut self.z; + + let mut y2_local: NN = other_xy.1.clone(); + let x2 = &mut other_xy.0; + if is_subtraction { + y2_local = y2_local.negated(cs); + } + let y2 = &mut y2_local; + + // t0 = x1 * x2 + let mut t0 = x1.mul(cs, x2); + // t1 = x1 * y2 + let mut t1 = y1.mul(cs, y2); + // t3 = x2 + y2 + let mut t3 = x2.add(cs, y2); + + // t4 = x1 + y1 + let mut t4 = x1.add(cs, y1); + // t3 = t3 * t4 + let mut t3 = t3.mul(cs, &mut t4); + // t4 = t0 + t1 + let mut t4 = t0.add(cs, &mut t1); + + // t3 = t3 - t4 + let mut t3 = t3.sub(cs, &mut t4); + // t4 = x2 * z1 + let mut t4 = x2.mul(cs, z1); + // t4 = t4 + x1 + let mut t4 = t4.add(cs, x1); + + // t5 = y2 * z1 + let mut t5 = y2.mul(cs, z1); + // t5 = t5 + y1 + let mut t5 = t5.add(cs, y1); + // z3 = a * t4 + let mut z3 = curve_a.mul(cs, &mut t4); + + // x3 = b3 * z1 + let mut x3 = curve_b3.mul(cs, z1); + // z3 = x3 + z3 + let mut z3 = x3.add(cs, &mut z3); + // x3 = t1 - z3 + let mut x3 = t1.sub(cs, &mut z3); + + // z3 = t1 + z3 + let mut z3 = t1.add(cs, &mut z3); + // y3 = x3 * z3 + let mut y3 = x3.mul(cs, &mut z3); + // t1 = t0 + t0 + let mut t1 = t0.double(cs); + + // t1 = t1 + t0 + let mut t1 = t1.add(cs, &mut t0); + // t2 = a * z1 + let mut t2 = curve_a.mul(cs, z1); + // t4 = b3 * t4 + let mut t4 = curve_b3.mul(cs, &mut t4); + + // t1 = t1 + t2 + let mut t1 = t1.add(cs, &mut t2); + // t2 = t0 - t2 + let mut t2 = t0.sub(cs, &mut t2); + // t2 = a * t2 + let mut t2 = curve_a.mul(cs, &mut t2); + + // t4 = t4 + t2 + let mut t4 = t4.add(cs, &mut t2); + // t0 = t1 * t4 + let mut t0 = t1.mul(cs, &mut t4); + // y3 = y3 + t0 + let y3 = y3.add(cs, &mut t0); + + // t0 = t5 * t4 + let mut t0 = t5.mul(cs, &mut t4); + // x3 = t3 * x3 + let mut x3 = t3.mul(cs, &mut x3); + // x3 = x3 - t0 + let x3 = x3.sub(cs, &mut t0); + + // t0 = t3 * t1 + let mut t0 = t3.mul(cs, &mut t1); + // z3 = t5 * z3 + let mut z3 = t5.mul(cs, &mut z3); + // z3 = z3 + t0 + let z3 = z3.add(cs, &mut t0); + + let new = Self { + x: x3, + y: y3, + z: z3, + _marker: std::marker::PhantomData, + }; + + new + } + + pub fn add_mixed>( + &mut self, + cs: &mut CS, + other_xy: &mut (NN, NN), + ) -> Self { + self.add_sub_mixed_impl(cs, other_xy, false) + } + + pub fn sub_mixed>( + &mut self, + cs: &mut CS, + other_xy: &mut (NN, NN), + ) -> Self { + self.add_sub_mixed_impl(cs, other_xy, true) + } + + pub fn convert_to_affine_or_default>( + &mut self, + cs: &mut CS, + default: C, + ) -> ((NN, NN), Boolean) { + let params = self.x.get_params().clone(); + let is_point_at_infty = NN::is_zero(&mut self.z, cs); + + let one_nn = NN::allocated_constant(cs, T::one(), ¶ms); + let mut safe_z = NN::conditionally_select(cs, is_point_at_infty, &one_nn, &self.z); + let x_for_safe_z = self.x.div_unchecked(cs, &mut safe_z); + let 
y_for_safe_z = self.y.div_unchecked(cs, &mut safe_z); + + let (default_x, default_y) = default.into_xy_unchecked(); + + let default_x = NN::from_curve_base(cs, &default_x, ¶ms); + let default_y = NN::from_curve_base(cs, &default_y, ¶ms); + + let x = NN::conditionally_select(cs, is_point_at_infty, &default_x, &x_for_safe_z); + let y = NN::conditionally_select(cs, is_point_at_infty, &default_y, &y_for_safe_z); + + ((x, y), is_point_at_infty) + } + + pub fn convert_to_affine_jacobian>( + &mut self, + cs: &mut CS, + default: C, + ) -> ((NN, NN), Boolean) { + let params = self.x.get_params().clone(); + let is_point_at_infty = NN::is_zero(&mut self.z, cs); + + let one_nn = NN::allocated_constant(cs, T::one(), ¶ms); + let mut safe_z = NN::conditionally_select(cs, is_point_at_infty, &one_nn, &self.z); + let mut safe_z_squared = safe_z.square(cs); + safe_z_squared.normalize(cs); + let mut safe_z_cubed = safe_z.mul(cs, &mut safe_z_squared); + safe_z_cubed.normalize(cs); + let mut x_for_safe_z = self.x.div_unchecked(cs, &mut safe_z_squared); + x_for_safe_z.normalize(cs); + let mut y_for_safe_z = self.y.div_unchecked(cs, &mut safe_z_cubed); + y_for_safe_z.normalize(cs); + + let (default_x, default_y) = default.into_xy_unchecked(); + + let default_x = NN::from_curve_base(cs, &default_x, ¶ms); + let default_y = NN::from_curve_base(cs, &default_y, ¶ms); + + let x = NN::conditionally_select(cs, is_point_at_infty, &default_x, &x_for_safe_z); + let y = NN::conditionally_select(cs, is_point_at_infty, &default_y, &y_for_safe_z); + + ((x, y), is_point_at_infty) + } + + pub fn enforce_reduced>(&mut self, cs: &mut CS) { + self.x.enforce_reduced(cs); + self.y.enforce_reduced(cs); + self.z.enforce_reduced(cs); + } +} + +impl Selectable for ExtendedSWProjectivePoint +where + F: SmallField, + T: PrimeField, + C: GenericCurveAffine, + NN: CurveCompatibleNonNativeField, +{ + const SUPPORTS_PARALLEL_SELECT: bool = false; + + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let x = NN::conditionally_select(cs, flag, &a.x, &b.x); + let y = NN::conditionally_select(cs, flag, &a.y, &b.y); + let z = NN::conditionally_select(cs, flag, &a.z, &b.z); + + Self { + x, + y, + z, + _marker: std::marker::PhantomData, + } + } +} diff --git a/src/gadgets/curves/sw_projective/mod.rs b/src/gadgets/curves/sw_projective/mod.rs index 50a4a1f..24cc7d7 100644 --- a/src/gadgets/curves/sw_projective/mod.rs +++ b/src/gadgets/curves/sw_projective/mod.rs @@ -1,3 +1,6 @@ +// Short weierstrass projective curve point implementation. 
+// Primarily based on the paper: https://eprint.iacr.org/2015/1060.pdf + use super::*; use crate::gadgets::traits::selectable::Selectable; @@ -7,7 +10,7 @@ use crate::{ }; use pairing::GenericCurveAffine; -// https://eprint.iacr.org/2015/1060.pdf +pub mod extended; #[derive(Derivative)] #[derivative(Clone, Debug)] @@ -55,6 +58,23 @@ where } } + pub fn one>(cs: &mut CS, params: &std::sync::Arc) -> Self { + use pairing::ff::Field; + + let one = C::one(); + let (x, y) = one.into_xy_unchecked(); + let x = NN::allocated_constant(cs, x, params); + let y = NN::allocated_constant(cs, y, params); + let z = NN::allocated_constant(cs, C::Base::one(), params); + + Self { + x, + y, + z, + _marker: std::marker::PhantomData, + } + } + pub fn double>(&mut self, cs: &mut CS) -> Self { use pairing::ff::Field; if C::a_coeff().is_zero() == false { @@ -490,6 +510,12 @@ where ((x, y), is_point_at_infty) } + + pub fn enforce_reduced>(&mut self, cs: &mut CS) { + self.x.enforce_reduced(cs); + self.y.enforce_reduced(cs); + self.z.enforce_reduced(cs); + } } impl> Selectable diff --git a/src/gadgets/curves/zeroable_affine/mod.rs b/src/gadgets/curves/zeroable_affine/mod.rs index 3d002f4..0b57d84 100644 --- a/src/gadgets/curves/zeroable_affine/mod.rs +++ b/src/gadgets/curves/zeroable_affine/mod.rs @@ -1,73 +1,278 @@ -use std::sync::Arc; - -use pairing::GenericCurveAffine; +use self::traits::selectable::Selectable; +use super::*; use crate::{ cs::traits::cs::ConstraintSystem, gadgets::{boolean::Boolean, non_native_field::traits::NonNativeField}, + pairing::{ + self, + ff::{Field, PrimeField}, + GenericCurveAffine, + }, }; +use std::{marker::PhantomData, sync::Arc}; -use super::*; - -pub struct ZeroableAffinePoint> +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct ZeroableAffinePoint where - C::Base: pairing::ff::PrimeField, + F: SmallField, + GC: GenericCurveAffine, + NF: NonNativeField, + GC::Base: pairing::ff::PrimeField, { - pub x: NN, - pub y: NN, - pub is_zero: Boolean, - pub _marker: std::marker::PhantomData, + x: NF, + y: NF, + pub is_infinity: Boolean, + pub _marker: PhantomData, } -// we only need add/sub/double/negate Mul is implemented by naive double-and-add, and we can have special -// mul that will multiply by an element of scalar field, where zeroness-exception can only happen once. - -// We also create decompress function for convenience - -impl> - ZeroableAffinePoint +impl ZeroableAffinePoint where - C::Base: pairing::ff::PrimeField, + F: SmallField, + GC: GenericCurveAffine, + NF: NonNativeField, + GC::Base: PrimeField, { - pub fn zero_point>(cs: &mut CS, params: &Arc) -> Self { - use pairing::ff::Field; - let zero_nn = NN::allocated_constant(cs, C::Base::zero(), params); - let boolean_true = Boolean::allocated_constant(cs, true); + /// Initializes a new non-infinite affine point with the specified coordinates + fn new(cs: &mut CS, x: NF, y: NF) -> Self + where + CS: ConstraintSystem, + { + Self { + x, + y, + is_infinity: Boolean::allocated_constant(cs, false), + _marker: PhantomData, + } + } + + /// Returns the x-coordinate of the point + fn x(&self) -> &NF { + &self.x + } + + /// Returns the y-coordinate of the point + fn y(&self) -> &NF { + &self.y + } + + /// Initializes the point at infinity. x and y are set to zero, and is_infinity is set to true. 
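// ---------------------------------------------------------------------------
// Aside: the projective `double` in sw_projective above (the a = 0 fast path)
// follows the complete-formula approach of https://eprint.iacr.org/2015/1060.pdf.
// Below is a minimal out-of-circuit sketch checking those doubling formulas
// against classical affine doubling, over the toy curve y^2 = x^3 + 4 mod
// 1_000_003 with base point (0, 2). The prime, the curve, and every name here
// are illustrative assumptions, not values from this crate.
fn complete_doubling_sketch() {
    const P: i128 = 1_000_003; // toy prime
    const B: i128 = 4; // curve coefficient b, so b3 = 3*b
    let md = |v: i128| v.rem_euclid(P);
    fn pow(mut b: i128, mut e: i128, p: i128) -> i128 {
        let mut r = 1;
        b = b.rem_euclid(p);
        while e > 0 {
            if e & 1 == 1 {
                r = r * b % p;
            }
            b = b * b % p;
            e >>= 1;
        }
        r
    }
    let inv = |v: i128| pow(v, P - 2, P); // Fermat inversion

    // Affine point (0, 2) lifted to projective (X : Y : Z) = (0 : 2 : 1).
    let (x, y, z) = (0i128, 2i128, 1i128);
    let b3 = 3 * B;

    // Complete doubling for y^2 = x^3 + b (the a = 0 case).
    let t0 = md(y * y);
    let mut z3 = md(t0 + t0);
    z3 = md(z3 + z3);
    z3 = md(z3 + z3); // 8*Y^2
    let t1 = md(y * z);
    let mut t2 = md(z * z);
    t2 = md(b3 * t2); // 3b*Z^2
    let mut x3 = md(t2 * z3);
    let mut y3 = md(t0 + t2);
    z3 = md(t1 * z3); // 8*Y^3 for Z = 1
    let t6 = md(t2 + t2);
    t2 = md(t6 + t2); // 9b for Z = 1
    let t0r = md(t0 - t2); // Y^2 - 9b
    y3 = md(t0r * y3);
    y3 = md(x3 + y3);
    let t7 = md(x * y);
    x3 = md(t0r * t7);
    x3 = md(x3 + x3); // 2*X*Y*(Y^2 - 9b)

    // Classical affine doubling: s = 3x^2 / (2y), x' = s^2 - 2x, y' = s(x - x') - y.
    let s = md(3 * x * x) * inv(md(2 * y)) % P;
    let ax = md(s * s - 2 * x);
    let ay = md(s * md(x - ax) - y);

    // De-homogenize the projective result and compare.
    assert_eq!(md(x3 * inv(z3)), ax);
    assert_eq!(md(y3 * inv(z3)), ay);
}
// Both asserts pass, matching the de-homogenized projective result with the
// affine one.
// ---------------------------------------------------------------------------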
+    fn zero_point<CS>(cs: &mut CS, params: &Arc<NF::Params>) -> Self
+    where
+        CS: ConstraintSystem<F>,
+    {
+        let zero_nf = NF::allocated_constant(cs, GC::Base::zero(), params);

         Self {
-            x: zero_nn.clone(),
-            y: zero_nn,
-            is_zero: boolean_true,
-            _marker: std::marker::PhantomData,
+            x: zero_nf.clone(),
+            y: zero_nf,
+            is_infinity: Boolean::allocated_constant(cs, true),
+            _marker: PhantomData,
+        }
+    }
+
+    /// Multiplies the affine point by a scalar using basic double-and-add:
+    /// one conditional addition and one doubling per scalar bit.
+    fn mul<CS>(&mut self, cs: &mut CS, scalar: &GC::Base) -> Self
+    where
+        CS: ConstraintSystem<F>,
+    {
+        let params = self.x.get_params().clone();
+        let mut result = Self::zero_point(cs, &params);
+        let mut temp = self.clone();
+        let zero = Self::zero_point(cs, &params);
+
+        // Convert the scalar to bits, LSB first: `into_repr().as_ref()` yields
+        // little-endian u64 limbs, so walk limbs and bits in ascending order.
+        let scalar_bits = scalar
+            .into_repr()
+            .as_ref()
+            .iter()
+            .flat_map(|limb| (0..64).map(move |i| (limb >> i) & 1 == 1))
+            .collect::<Vec<_>>();
+
+        for bit in scalar_bits {
+            let bit_is_one = Boolean::allocated_constant(cs, bit);
+            let mut point_to_add = Self::conditionally_select(cs, bit_is_one, &temp, &zero);
+
+            result = result.add_unequal_x(cs, &mut point_to_add);
+            temp.double(cs);
+        }
+
+        result
+    }
+
+    /// Doubles the point X (that is, finds 2X = X + X)
+    fn double<CS>(&mut self, cs: &mut CS) -> Self
+    where
+        CS: ConstraintSystem<F>,
+    {
+        // Validating that y1 is not zero
+        let is_zero = self.y.is_zero(cs);
+        let boolean_false = Boolean::allocated_constant(cs, false);
+        Boolean::enforce_equal(cs, &is_zero, &boolean_false);
+
+        // Algorithm for doubling a point (x1, y1):
+        // First, find slope = (3 * x1^2 + a) / (2 * y1)
+        // Then, find x3 = slope^2 - 2 * x1 and y3 = slope * (x1 - x3) - y1
+
+        // Getting parameter a
+        let params = self.x.get_params().clone();
+        let a = GC::a_coeff();
+        let mut a_nf = NF::allocated_constant(cs, a, &params);
+
+        // Calculating the numerator
+        let mut numerator = self.x.clone().square(cs);
+        // Multiplying by 3
+        let mut initial_numerator = numerator.clone();
+        numerator = numerator.double(cs);
+        numerator = numerator.add(cs, &mut initial_numerator);
+        // Adding a
+        numerator = numerator.add(cs, &mut a_nf);
+
+        // Calculating the denominator
+        let mut denominator = self.y.clone();
+        // Multiplying by 2
+        denominator = denominator.double(cs);
+
+        // Calculating the slope
+        let mut slope = numerator.div_unchecked(cs, &mut denominator);
+
+        // Finding x3
+        let mut x = slope.clone().square(cs);
+        x = x.sub(cs, &mut self.x);
+        x = x.sub(cs, &mut self.x);
+
+        // Finding y3
+        let mut y = self.x.sub(cs, &mut x);
+        y = slope.mul(cs, &mut y);
+        y = y.sub(cs, &mut self.y);
+
+        self.x = x;
+        self.y = y;
+        Self {
+            x: self.x.clone(),
+            y: self.y.clone(),
+            is_infinity: self.is_infinity,
+            _marker: PhantomData,
+        }
+    }
+
+    /// Negates the point by negating the y coordinate
+    fn negate<CS>(&mut self, cs: &mut CS) -> Self
+    where
+        CS: ConstraintSystem<F>,
+    {
+        self.y = self.y.negated(cs);
+
+        Self {
+            x: self.x.clone(),
+            y: self.y.clone(),
+            is_infinity: self.is_infinity,
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<F, GC, NF> ZeroableAffinePoint<F, GC, NF>
+where
+    F: SmallField,
+    GC: GenericCurveAffine,
+    NF: NonNativeField<F, GC::Base>,
+    GC::Base: PrimeField,
+{
+    /// Returns a boolean that is true if the x coordinates of the two points are equal.
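// ---------------------------------------------------------------------------
// The `mul` above is plain LSB-first double-and-add. A tiny sketch of the same
// control flow over u64, with addition standing in for the group law; names
// here are illustrative only.
fn double_and_add_sketch(point: u64, scalar: u64) -> u64 {
    let mut result = 0u64; // group identity ("point at infinity")
    let mut temp = point; // invariant: temp = 2^i * point at bit i
    for i in 0..64 {
        if (scalar >> i) & 1 == 1 {
            result = result.wrapping_add(temp); // "conditional add"
        }
        temp = temp.wrapping_add(temp); // "double"
    }
    result // point * scalar (mod 2^64)
}
// For example, double_and_add_sketch(7, 13) == 91, since 13 = 1 + 4 + 8 and
// 7 + 4*7 + 8*7 = 91.
// ---------------------------------------------------------------------------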
+ pub fn same_x(&mut self, cs: &mut CS, other: &mut Self) -> Boolean + where + CS: ConstraintSystem, + { self.x.equals(cs, &mut other.x) } - pub fn same_y>(&mut self, cs: &mut CS, other: &mut Self) -> Boolean { + /// Returns a boolean that is true if the y coordinates of the two points are equal. + pub fn same_y(&mut self, cs: &mut CS, other: &mut Self) -> Boolean + where + CS: ConstraintSystem, + { self.y.equals(cs, &mut other.y) } - #[allow(unused_assignments)] - pub fn add_unequal>(&mut self, cs: &mut CS, other: &mut Self) -> Self { + /// Adds two affine points elementwise. + pub fn elementwise_add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.x = self.x.add(cs, &mut other.x); + self.y = self.y.add(cs, &mut other.y); + Self { + x: self.x.clone(), + y: self.y.clone(), + is_infinity: self.is_infinity, + _marker: PhantomData, + } + } + + /// Adds two points with unequal x coordinates. If the x coordinates are equal, the result is undefined + /// and therefore the panic is raised. + pub fn add_unequal_x(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + // Verify that the x coordinates are not equal let same_x = self.same_x(cs, other); let boolean_false = Boolean::allocated_constant(cs, false); Boolean::enforce_equal(cs, &same_x, &boolean_false); - let mut divisor = self.x.sub(cs, &mut other.x); - let mut numerator = self.y.sub(cs, &mut other.y); - let mut slope = numerator.div_unchecked(cs, &mut divisor); - let mut x2 = slope.clone(); - x2 = x2.mul(cs, &mut slope); - let mut tmp = self.x.add(cs, &mut other.x); - x2 = x2.sub(cs, &mut tmp); + // Algorithm for having two points (x1, y1) and (x2, y2) and adding them together: + // First, finding slope = (y2 - y1) / (x2 - x1) + // Then, finding x3 = slope^2 - x1 - x2 and y3 = slope * (x1 - x3) - y1 + let mut dx = self.x.sub(cs, &mut other.x); + let mut dy = self.y.sub(cs, &mut other.y); + // slope = dy / dx and we do not care whether dx is zero or not since we have already checked that + let mut slope = dy.div_unchecked(cs, &mut dx); + + // x3 = slope^2 - x1 - x2 + let mut x = slope.clone().square(cs); + x = x.sub(cs, &mut self.x); + x = x.sub(cs, &mut other.x); - let mut tmp = self.x.sub(cs, &mut x2); - let mut y2 = slope.mul(cs, &mut tmp); - y2 = y2.add(cs, &mut self.y); + // y3 = slope * (x1 - x3) - y1 + let mut y = self.x.sub(cs, &mut x); + y = slope.mul(cs, &mut y); + y = y.sub(cs, &mut self.y); + + self.x = x; + self.y = y; + Self { + x: self.x.clone(), + y: self.y.clone(), + is_infinity: self.is_infinity, + _marker: PhantomData, + } + } +} - todo!() +impl> Selectable + for ZeroableAffinePoint +where + C::Base: pairing::ff::PrimeField, +{ + const SUPPORTS_PARALLEL_SELECT: bool = false; + + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let x = NN::conditionally_select(cs, flag, &a.x, &b.x); + let y = NN::conditionally_select(cs, flag, &a.y, &b.y); + let is_infinity = Boolean::conditionally_select(cs, flag, &a.is_infinity, &b.is_infinity); + + Self { + x, + y, + is_infinity, + _marker: std::marker::PhantomData, + } } } diff --git a/src/gadgets/mod.rs b/src/gadgets/mod.rs index 7c3b400..992853e 100644 --- a/src/gadgets/mod.rs +++ b/src/gadgets/mod.rs @@ -16,10 +16,14 @@ pub mod recursion; pub mod round_function; pub mod sha256; pub mod tables; +pub mod tower_extension; pub mod traits; +pub mod u1024; pub mod u16; pub mod u160; +pub mod u2048; pub mod u256; pub mod u32; +pub mod u4096; pub mod u512; pub mod 
u8; diff --git a/src/gadgets/non_native_field/implementations/impl_traits.rs b/src/gadgets/non_native_field/implementations/impl_traits.rs index 51ba091..2917f9a 100644 --- a/src/gadgets/non_native_field/implementations/impl_traits.rs +++ b/src/gadgets/non_native_field/implementations/impl_traits.rs @@ -42,6 +42,9 @@ where fn enforce_reduced>(&mut self, cs: &mut CS) { NonNativeFieldOverU16::::enforce_reduced(self, cs) } + fn enforce_equal>(cs: &mut CS, a: &Self, b: &Self) { + NonNativeFieldOverU16::::enforce_equal(cs, a, b) + } fn normalize>(&mut self, cs: &mut CS) { NonNativeFieldOverU16::::normalize(self, cs) } @@ -94,12 +97,12 @@ where NonNativeFieldOverU16::::div_unchecked(self, cs, other) } #[must_use] - fn allocate_inverse_or_zero>(&self, _cs: &mut CS) -> Self { - todo!() + fn allocate_inverse_or_zero>(&self, cs: &mut CS) -> Self { + NonNativeFieldOverU16::::allocate_inverse_or_zero(&self, cs) } #[must_use] - fn inverse_unchecked>(&mut self, _cs: &mut CS) -> Self { - todo!() + fn inverse_unchecked>(&mut self, cs: &mut CS) -> Self { + NonNativeFieldOverU16::::inverse_unchecked(self, cs) } #[must_use] fn is_zero>(&mut self, cs: &mut CS) -> Boolean { diff --git a/src/gadgets/non_native_field/implementations/implementation_u16.rs b/src/gadgets/non_native_field/implementations/implementation_u16.rs index f29509b..3edcb63 100644 --- a/src/gadgets/non_native_field/implementations/implementation_u16.rs +++ b/src/gadgets/non_native_field/implementations/implementation_u16.rs @@ -1,4 +1,7 @@ use crypto_bigint::CheckedMul; +use serde::de::Visitor; +use serde::{de, Deserialize, Deserializer, Serialize}; +use std::fmt; use crate::cs::gates::{ ConstantAllocatableCS, DotProductGate, FmaGateInBaseFieldWithoutConstant, UIntXAddGate, @@ -6,7 +9,7 @@ use crate::cs::gates::{ use crate::cs::traits::cs::DstBuffer; use crate::gadgets::boolean::Boolean; use crate::gadgets::num::Num; -use crate::gadgets::traits::allocatable::CSAllocatable; +use crate::gadgets::traits::allocatable::{CSAllocatable, CSPlaceholder}; use crate::gadgets::traits::castable::WitnessCastable; use crate::gadgets::traits::selectable::Selectable; use crate::gadgets::traits::witnessable::{CSWitnessable, WitnessHookable}; @@ -126,6 +129,25 @@ where self.tracker.max_moduluses = 1; } + pub fn enforce_equal>(cs: &mut CS, a: &Self, b: &Self) { + let mut a = a.clone(); + let mut b = b.clone(); + + a.normalize(cs); + b.normalize(cs); + + if ::DebugConfig::PERFORM_RUNTIME_ASSERTS { + assert_eq!( + a.non_zero_limbs, b.non_zero_limbs, + "enforce equal failed: non_zero_limbs divergence" + ); + } + + for (a_el, b_el) in a.limbs.iter().zip(b.limbs.iter()) { + Num::enforce_equal(cs, &Num::from_variable(*a_el), &Num::from_variable(*b_el)); + } + } + pub fn normalize>(&mut self, cs: &mut CS) where [(); N + 1]:, @@ -258,6 +280,7 @@ where } } + assert!(self.tracker.max_moduluses <= self.params.max_mods_to_fit); new } @@ -334,6 +357,7 @@ where } } + assert!(self.tracker.max_moduluses <= self.params.max_mods_to_fit); new } @@ -713,6 +737,7 @@ where // enforce that r is canonical new.enforce_reduced(cs); + assert!(self.tracker.max_moduluses <= self.params.max_mods_to_fit); new } @@ -764,7 +789,9 @@ where let new = Self { limbs, non_zero_limbs: used_words, - tracker: OverflowTracker { max_moduluses: 2 }, // NOTE: if self == 0, then limbs will be == modulus, so use 2 + tracker: OverflowTracker { + max_moduluses: std::cmp::max(2, self.tracker.max_moduluses), + }, // NOTE: if self == 0, then limbs will be == modulus, so use 2 form: RepresentationForm::Normalized, 
params: self.params.clone(), _marker: std::marker::PhantomData, @@ -858,6 +885,7 @@ where } } + assert!(self.tracker.max_moduluses <= self.params.max_mods_to_fit); new } @@ -1032,14 +1060,106 @@ impl CSAllocatable } } +impl CSPlaceholder + for NonNativeFieldOverU16 +{ + fn placeholder>(cs: &mut CS) -> Self { + let variable = Variable::placeholder(); + + Self { + limbs: [variable; N], + non_zero_limbs: 0, + tracker: OverflowTracker { max_moduluses: 0 }, + form: RepresentationForm::Normalized, + params: Arc::new(NonNativeFieldOverU16Params::placeholder(cs)), + _marker: std::marker::PhantomData, + } + } +} + +impl CircuitVarLengthEncodable + for NonNativeFieldOverU16 +{ + fn encoding_length(&self) -> usize { + N + } + + fn encode_to_buffer>(&self, _cs: &mut CS, dst: &mut Vec) { + dst.extend_from_slice(self.limbs.as_slice()) + } +} + // We need this to ensure no conflicting implementations without negative impls -#[derive(Derivative)] -#[derivative(Clone, Copy, Debug, Hash)] +#[derive(Derivative, Serialize, PartialEq)] +#[derivative(Clone, Copy, Debug, Hash, Eq)] pub struct FFProxyValue { value: T, } +// Implement custom Deserialize, because we cannot derive: +// PrimeField inherits only DeserializeOwned. +impl<'de, T, const N: usize> Deserialize<'de> for FFProxyValue +where + T: pairing::ff::PrimeField, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct FFProxyValueVisitor + where + T: pairing::ff::PrimeField, + { + marker: std::marker::PhantomData, + } + + impl<'de, T, const N: usize> Visitor<'de> for FFProxyValueVisitor + where + T: pairing::ff::PrimeField + serde::de::DeserializeOwned, + { + type Value = FFProxyValue; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a valid PrimeField value") + } + + fn visit_map(self, mut map: M) -> Result + where + M: de::MapAccess<'de>, + { + let mut value = None; + + while let Some(key) = map.next_key()? 
{ + match key { + "value" => { + if value.is_some() { + return Err(de::Error::duplicate_field("value")); + } + value = Some(map.next_value()?); + } + _ => { + return Err(de::Error::unknown_field(key, FIELDS)); + } + } + } + + let value = value.ok_or_else(|| de::Error::missing_field("value"))?; + Ok(FFProxyValue { value }) + } + } + + const FIELDS: &[&str] = &["value"]; + deserializer.deserialize_struct( + "FFProxyValue", + FIELDS, + FFProxyValueVisitor { + marker: std::marker::PhantomData, + }, + ) + } +} + impl FFProxyValue { pub const fn get(&self) -> T { self.value @@ -1080,6 +1200,7 @@ impl WitnessCastable< } use crate::gadgets::traits::castable::Convertor; +use crate::gadgets::traits::encodable::CircuitVarLengthEncodable; impl CSWitnessable for NonNativeFieldOverU16 diff --git a/src/gadgets/non_native_field/implementations/mod.rs b/src/gadgets/non_native_field/implementations/mod.rs index 9c38036..16a322f 100644 --- a/src/gadgets/non_native_field/implementations/mod.rs +++ b/src/gadgets/non_native_field/implementations/mod.rs @@ -6,6 +6,7 @@ use super::*; use crate::config::*; use crate::cs::gates::ConstantAllocatableCS; use crate::cs::traits::cs::ConstraintSystem; +use crate::gadgets::traits::allocatable::CSPlaceholder; use crate::gadgets::u16::UInt16; use crate::{cs::Variable, gadgets::u8::get_8_by_8_range_check_table}; use crypto_bigint::{CheckedMul, NonZero, Zero, U1024}; @@ -177,6 +178,24 @@ impl NonNativeFieldOverU16Params CSPlaceholder + for NonNativeFieldOverU16Params +{ + fn placeholder>(_cs: &mut CS) -> Self { + Self { + modulus: [0u16; N], + modulus_bits: 0, + modulus_limbs: 0, + modulus_u1024: NonZero::::new(U1024::ONE).expect("ONE is non-zero"), + max_product_before_reduction: U1024::ZERO, + max_mods_to_fit: 0, + max_mods_in_allocation: 0, + max_mods_before_multiplication: 0, + _marker: std::marker::PhantomData, + } + } +} + #[derive(Derivative)] #[derivative(Clone, Copy, Debug, PartialEq, Eq)] pub enum RepresentationForm { diff --git a/src/gadgets/non_native_field/traits/mod.rs b/src/gadgets/non_native_field/traits/mod.rs index b4cfc94..fb751d9 100644 --- a/src/gadgets/non_native_field/traits/mod.rs +++ b/src/gadgets/non_native_field/traits/mod.rs @@ -1,6 +1,9 @@ use super::*; + use crate::gadgets::boolean::Boolean; use crate::{cs::traits::cs::ConstraintSystem, gadgets::traits::witnessable::WitnessHookable}; + +use pairing::GenericCurveAffine; use std::sync::Arc; pub trait NonNativeField: @@ -26,6 +29,7 @@ pub trait NonNativeField: ) -> Self; fn enforce_reduced>(&mut self, cs: &mut CS); + fn enforce_equal>(cs: &mut CS, a: &Self, b: &Self); fn normalize>(&mut self, cs: &mut CS); fn add>(&mut self, cs: &mut CS, other: &mut Self) -> Self; @@ -79,3 +83,16 @@ pub trait NonNativeField: b: &Self, ) -> Self; } + +pub trait CurveCompatibleNonNativeField< + F: SmallField, + T: pairing::ff::PrimeField, + C: GenericCurveAffine, +>: NonNativeField +{ + fn from_curve_base>( + cs: &mut CS, + point: &C::Base, + params: &Arc, + ) -> Self; +} diff --git a/src/gadgets/tower_extension/algebraic_torus.rs b/src/gadgets/tower_extension/algebraic_torus.rs new file mode 100644 index 0000000..bf30501 --- /dev/null +++ b/src/gadgets/tower_extension/algebraic_torus.rs @@ -0,0 +1,480 @@ +use pairing::{ff::PrimeField, BitIterator}; +use std::sync::Arc; + +use super::{fq12::Fq12, fq2::Fq2, fq6::Fq6, params::TorusExtension12Params}; +use crate::gadgets::non_native_field::implementations::NonNativeFieldOverU16; +use crate::gadgets::tower_extension::params::{Extension2Params, Extension6Params}; +use 
crate::gadgets::traits::witnessable::WitnessHookable; +use crate::{ + cs::traits::cs::ConstraintSystem, + field::SmallField, + gadgets::{ + boolean::Boolean, + non_native_field::traits::NonNativeField, + traits::{hardexp_compatible::HardexpCompatible, selectable::Selectable}, + }, +}; + +/// [`TorusWrapper`] is an algebraic compression of the `Fq12` element via underlying encoding of `Fq6`. +/// In compressed form operations over Fq12 are less expensive. +/// +/// The implementation is based on the following paper: +/// https://eprint.iacr.org/2022/1162.pdf. +#[derive(Clone, Debug, Copy)] +pub struct TorusWrapper +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: TorusExtension12Params, +{ + pub encoding: Fq6, +} + +// TODO: Probably, this could be implemented generally for any two Fqk and Fq(k/2) elements. +impl TorusWrapper, P> +where + F: SmallField, + T: PrimeField, + P: TorusExtension12Params, + [(); N + 1]:, +{ + /// Creates a new instance of the [`TorusWrapper`] with the given encoding. + pub fn new(encoding: Fq6, P::Ex6>) -> Self { + Self { encoding } + } + + pub fn one( + cs: &mut CS, + params: &Arc< as NonNativeField>::Params>, + ) -> Self + where + CS: ConstraintSystem, + { + let encoding = Fq6::zero(cs, params); + Self::new(encoding) + } + + /// Returns the underlying parameters of the encoded `Fq6` element. + pub fn get_params( + &self, + ) -> &Arc< as NonNativeField>::Params> { + self.encoding.get_params() + } + + /// Normalizes the encoding of the `Fq6` element. + pub fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.encoding.normalize(cs); + } + + /// Returns an instance if `flag` is `true`, otherwise returns a zero element. + pub fn mask(&mut self, cs: &mut CS, flag: Boolean) -> Self + where + CS: ConstraintSystem, + { + let zero = Fq6::zero(cs, self.get_params()); + let new_encoding = + , P::Ex6>>::conditionally_select( + cs, + flag, + &self.encoding, + &zero, + ); + + Self::new(new_encoding) + } + + /// Compresses the `Fq12` element `c0 + c1*w` to the Torus (`T2`) element. + /// + /// Uses the formula `m <- (1 + c0) / c1` to compress the `Fq12` element with the additional + /// check for the exceptional case when `c1` is zero. + /// + /// If `SAFE=false`, then the function will not check for the exceptional case when `c1` is zero. + pub fn compress( + cs: &mut CS, + f: &mut Fq12, P>, + ) -> Self + where + CS: ConstraintSystem, + { + let params = f.get_params(); + let mut c0 = f.c0.clone(); + let mut c1 = f.c1.clone(); + + let mut encoding = if SAFE { + // Preparing flags for exception cases + let is_exceptional = Fq6::is_zero(&mut c1, cs); + let mut c0_is_one = Fq6::one(cs, params); + let c0_is_one = c0_is_one.equals(cs, &mut c0); + let mut is_exceptional = Fq6::from_boolean(cs, is_exceptional, params); + let mut c0_is_one = Fq6::from_boolean(cs, c0_is_one, params); + + // m <- (1 + c0) / c1 if c1 is non-zero. However, to account for the case where + // c1 is zero, we set numerator to 1 + c0 - 2*c0_is_one and denominator to c1 + is_exceptional. 
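// In fact, for a unit-norm input (the torus setting, conj(f) * f = 1, i.e.
// c0^2 - gamma * c1^2 = 1), c1 = 0 forces c0 = 1 or c0 = -1; in both cases the
// masked numerator 1 + c0 - 2*c0_is_one evaluates to 0 while the denominator
// becomes 1, so every exceptional input lands on the fixed encoding m = 0.
// Conversely, c0 = 1 with c1 != 0 cannot occur for unit-norm inputs, so the
// c0_is_one correction never alters a well-formed value.
//
// A minimal out-of-circuit sketch of the compress/decompress round trip,
// assuming a toy quadratic extension Fp[w]/(w^2 - g) in place of
// Fq12 = Fq6[w]/(w^2 - gamma); the prime, g, and all names are illustrative.
fn torus_round_trip_sketch() {
    const P: i64 = 103;
    const G: i64 = 5; // w^2 = g, with g a quadratic non-residue mod 103
    let md = |v: i64| v.rem_euclid(P);
    fn pow(mut b: i64, mut e: i64, p: i64) -> i64 {
        let mut r = 1;
        b = b.rem_euclid(p);
        while e > 0 {
            if e & 1 == 1 {
                r = r * b % p;
            }
            b = b * b % p;
            e >>= 1;
        }
        r
    }
    let inv = |v: i64| pow(v, P - 2, P);

    // Arithmetic in Fp[w]/(w^2 - g): an element (c0, c1) stands for c0 + c1*w.
    let mul = |a: (i64, i64), b: (i64, i64)| {
        (md(a.0 * b.0 + G * a.1 * b.1), md(a.0 * b.1 + a.1 * b.0))
    };
    let conj = |a: (i64, i64)| (a.0, md(-a.1));
    let inv_ext = |a: (i64, i64)| {
        // 1/(c0 + c1*w) = (c0 - c1*w) / (c0^2 - g*c1^2)
        let n = inv(md(a.0 * a.0 - G * a.1 * a.1));
        (md(a.0 * n), md(-a.1 * n))
    };

    // A unit-norm element: f = a / conj(a) always satisfies c0^2 - g*c1^2 = 1.
    let a = (17, 42);
    let f = mul(a, inv_ext(conj(a)));
    assert_eq!(md(f.0 * f.0 - G * f.1 * f.1), 1);

    // Compress: m = (1 + c0) / c1 (the non-exceptional path, c1 != 0 here).
    let m = md((1 + f.0) * inv(f.1));

    // Decompress: (m + w) / (m - w), computed in the extension; round-trips.
    let back = mul((m, 1), inv_ext((m, md(-1))));
    assert_eq!(back, f);
}
// ---------------------------------------------------------------------------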
+ let mut numerator = Fq6::one(cs, params); + let mut numerator = numerator.add(cs, &mut c0); + let mut c0_is_one_doubled = c0_is_one.double(cs); + let mut numerator = numerator.sub(cs, &mut c0_is_one_doubled); + let mut denominator = f.c1.add(cs, &mut is_exceptional); + denominator.normalize(cs); + + let encoding = numerator.div(cs, &mut denominator); + encoding + } else { + // Verifying that c1 is non-zero + let boolean_false = Boolean::allocated_constant(cs, false); + let c1_is_zero = c1.is_zero(cs); + Boolean::enforce_equal(cs, &c1_is_zero, &boolean_false); + + // m <- (1 + c0) / c1 + let mut encoding = Fq6::one(cs, params); + let mut encoding = encoding.add(cs, &mut f.c0); + let encoding = encoding.div(cs, &mut f.c1); + + encoding + }; + + encoding.normalize(cs); + Self::new(encoding) + } + + /// Decompresses the Torus (`T2`) element `g` back to the `Fq12` element by using the formula + /// + /// `zeta^{-1} = (g + w)/(g - w)` + pub fn decompress(&self, cs: &mut CS) -> Fq12, P> + where + CS: ConstraintSystem, + { + let params = self.get_params(); + let mut one = Fq6::one(cs, params); + let negative_one = one.negated(cs); + + // Since `g` is a pure `Fq6` element, `g+w` is just an `Fq12` element with `c0 = g` and `c1 = 1`. + let mut numerator = Fq12::new(self.encoding.clone(), one); + // Since `g` is a pure `Fq6` element, `g-w` is just an `Fq12` element with `c0 = g` and `c1 = -1`. + let mut denominator = Fq12::new(self.encoding.clone(), negative_one); + + // zeta^{-1} = (g + w)/(g - w) + let decompressed = numerator.div(cs, &mut denominator); + + decompressed + } + + /// Computes the inverse of the Torus element using the formula g -> -g. + pub fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let encoding = self.encoding.negated(cs); + Self::new(encoding) + } + + /// Computes the conjugate of the Torus element using the formula g -> -g. + /// Note that the conjugate of the Torus element is the same as its inverse. + pub fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + /// Computes the Frobenius map of the Torus element with the given power using the formula + /// + /// frob_map(g, i) = g^(p^i) / \gamma^{(p^i-1)/2} + pub fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + // We compute frobenius map unconstrained: + let witness_self = self.encoding_to_witness(cs); + let witness_frob = P::torus_frobenius_map(witness_self, power); + + // Now, we constraint the frobenius map with a cheaper version: + // Suppose r = f(g,i) / (f(w,i) * w^{-1}). Then, we require: + // f(g, i) = f(w, i) * (w^{-1}) * r + // Notice that `f(w,i)*w^{-1}` must yield an element + // from Fq6. Thus, we need one frobenius map + mul over Fq6, and + // one frobenius map + mul over Fq12. + let params = self.encoding.get_params(); + let mut encoding_new = Fq6::allocate_from_witness(cs, witness_frob, params); + + // rhs = f(w, i) * (w^{-1}) * r + // First, allocating the w^{-1} + let w_inverse = P::get_w_inverse_coeffs_c5(); + let mut w_inverse: Fq2<_, _, _, >::Ex2> = Fq2::constant(cs, w_inverse, params); + + let mut rhs: Fq12, P> = Fq12::one_imaginary(cs, params); + rhs = rhs.frobenius_map(cs, power); + rhs = rhs.mul_by_c5(cs, &mut w_inverse); + + // Asserting that c1 is zero since rhs must be a pure Fq6 element at this point. 
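// (For reference: w^(p^i) is always a constant multiple of w, so
// f(w, i) * w^{-1} lies in Fq6; multiplying by the constant c5*v^2*w uses
// w^2 * v^2 = v^3 = \xi to fold the result back into Fq6. The c1 == 0 check
// below enforces exactly this.)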
+ let boolean_true = Boolean::allocated_constant(cs, true); + let c1_is_zero = rhs.c1.is_zero(cs); + Boolean::enforce_equal(cs, &c1_is_zero, &boolean_true); + let mut rhs = rhs.c0.clone(); + + // Finishing rhs by multiplying by result + rhs = rhs.mul(cs, &mut encoding_new); + + // lhs = f(g, i) + let mut lhs = self.encoding.clone(); + lhs = lhs.frobenius_map(cs, power); + + // Asserting that lhs == rhs + Fq6::enforce_equal(cs, &lhs, &rhs); + + Self::new(encoding_new) + } + + /// Computes the product of two Torus elements using the formula + /// + /// `(g, g') -> (g * g' + \gamma) / (g + g')` + /// + /// The formula handles the exceptional case when `g + g'` is zero. + pub fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + // We compute multiplication unconstrained: + let witness_self = self.encoding_to_witness(cs); + let witness_other = other.encoding_to_witness(cs); + let witness_mul = P::torus_mul(witness_self, witness_other); + + // Now, we constraint the multiplication with a cheaper version: + // g'' = (g * g' + \gamma) / (g + g') is equivalent to + // g'' * (g + g') = (g * g' + \gamma) + // Here, g'' is the new encoding. + let params = self.encoding.get_params(); + let encoding_new = Fq6::allocate_from_witness(cs, witness_mul, params); + + // lhs = g'' * (g + g') + let mut sum = self.encoding.clone().add(cs, &mut other.encoding); + let lhs = encoding_new.clone().mul(cs, &mut sum); + + // rhs = {(g + g') == 0} ? zero : (g * g' + \gamma) + let mut gamma = Fq6::gamma(cs, params); + let mut rhs = self.encoding.clone().mul(cs, &mut other.encoding); + let rhs = rhs.add(cs, &mut gamma); + + let zero = Fq6::zero(cs, params); + let is_zero_sum = sum.is_zero(cs); + + let rhs = , P::Ex6>>::conditionally_select( + cs, + is_zero_sum, + &zero, + &rhs, + ); + + // Enforce equality + Fq6::enforce_equal(cs, &lhs, &rhs); + + Self::new(encoding_new) + } + + pub fn pow_naf_decomposition>( + &mut self, + cs: &mut CS, + decomposition: S, + ) -> Self + where + CS: ConstraintSystem, + { + // Intializing the result with 1 + let mut result = Self::one(cs, self.get_params()); + + // Preparing self and self inverse in advance + let mut self_cloned = self.clone(); + let mut self_inverse = self.conjugate(cs); + + for bit in decomposition.as_ref().iter() { + result = result.square(cs); + + // If bit is 1, multiply by initial torus + let bit_is_one = Boolean::allocated_constant(cs, *bit == 1); + let result_times_self = result.mul(cs, &mut self_cloned); + result = Self::conditionally_select(cs, bit_is_one, &result_times_self, &result); + + // If bit is -1, multiply by inverse initial torus + let bit_is_minus_one = Boolean::allocated_constant(cs, *bit == -1); + let result_times_self_inverse = result.mul(cs, &mut self_inverse); + result = Self::conditionally_select( + cs, + bit_is_minus_one, + &result_times_self_inverse, + &result, + ); + } + + result + } + + pub fn pow_u32>(&mut self, cs: &mut CS, exponent: S) -> Self + where + CS: ConstraintSystem, + { + let mut result = Self::one(cs, self.get_params()); + let mut found_one = false; + + for bit in BitIterator::new(exponent) { + let apply_squaring = Boolean::allocated_constant(cs, found_one); + let result_squared = result.square(cs); + result = Self::conditionally_select(cs, apply_squaring, &result_squared, &result); + if !found_one { + found_one = bit; + } + + let result_multiplied = result.mul(cs, self); + let apply_multiplication = Boolean::allocated_constant(cs, bit); + result = + Self::conditionally_select(cs, 
apply_multiplication, &result_multiplied, &result); + + result.normalize(cs); + } + + result + } + + pub fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // We compute squaring unconstrained: + let witness = self.encoding_to_witness(cs); + let witness_squared = P::torus_square(witness); + + // Now, we constraint squaring with a cheaper version: + // g' = (1/2)(g + \gamma/g) is equivalent to + // (2g' - g)*g = gamma + let params = self.encoding.get_params(); + let encoding_new = Fq6::allocate_from_witness(cs, witness_squared, params); + + // lhs = (2g' - g)*g + let mut lhs = encoding_new.clone(); + lhs = lhs.double(cs); + lhs = lhs.sub(cs, &mut self.encoding.clone()); + let lhs = self.encoding.clone().mul(cs, &mut lhs); + + // rhs = g == 0 ? zero : gamma + let zero = Fq6::zero(cs, params); + let gamma = Fq6::gamma(cs, params); + let is_zero_g = self.encoding.is_zero(cs); + let rhs = , P::Ex6>>::conditionally_select( + cs, is_zero_g, &zero, &gamma, + ); + + // We can just enforce equality without subbing + Fq6::enforce_equal(cs, &lhs, &rhs); + Self::new(encoding_new) + } + + // TODO: Probably, this can be done less weirdly. + /// Converts the encoding of the `Fq6` element to the structured witness. + pub(super) fn encoding_to_witness( + &self, + cs: &mut CS, + ) -> >::Witness + where + CS: ConstraintSystem, + { + let (c0, c1, c2) = self.encoding.witness_hook(cs)().unwrap(); + + let (c0_c0, c0_c1) = c0; + let (c1_c0, c1_c1) = c1; + let (c2_c0, c2_c1) = c2; + + let (c0_c0, c0_c1) = (c0_c0.get(), c0_c1.get()); + let (c1_c0, c1_c1) = (c1_c0.get(), c1_c1.get()); + let (c2_c0, c2_c1) = (c2_c0.get(), c2_c1.get()); + + let c0 = >::Ex2::convert_to_structured_witness(c0_c0, c0_c1); + let c1 = >::Ex2::convert_to_structured_witness(c1_c0, c1_c1); + let c2 = >::Ex2::convert_to_structured_witness(c2_c0, c2_c1); + + P::Ex6::convert_to_structured_witness(c0, c1, c2) + } +} + +impl Selectable + for TorusWrapper, P> +where + F: SmallField, + T: PrimeField, + P: TorusExtension12Params, + [(); N + 1]:, +{ + fn conditionally_select(cs: &mut CS, flag: Boolean, a: &Self, b: &Self) -> Self + where + CS: ConstraintSystem, + { + let encoding = , P::Ex6>>::conditionally_select( + cs, + flag, + &a.encoding, + &b.encoding, + ); + + Self::new(encoding) + } +} + +impl HardexpCompatible + for TorusWrapper, P> +where + F: SmallField, + T: PrimeField, + P: TorusExtension12Params, + [(); N + 1]:, +{ + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.mul(cs, other) + } + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.square(cs) + } + + fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.conjugate(cs) + } + + fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + self.frobenius_map(cs, power) + } + + fn pow_u32>(&mut self, cs: &mut CS, exponent: S) -> Self + where + CS: ConstraintSystem, + { + self.pow_u32(cs, exponent) + } + + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.normalize(cs); + } +} diff --git a/src/gadgets/tower_extension/fq12.rs b/src/gadgets/tower_extension/fq12.rs new file mode 100644 index 0000000..bc9f68a --- /dev/null +++ b/src/gadgets/tower_extension/fq12.rs @@ -0,0 +1,819 @@ +use std::sync::Arc; + +use pairing::{bn256::Fq as BN256Fq, ff::PrimeField, BitIterator}; + +use 
super::{ + fq2::Fq2, + fq6::Fq6, + params::{ + bn256::{BN256Extension12Params, BN256Extension6Params}, + Extension12Params, Extension6Params, + }, +}; + +use crate::gadgets::traits::allocatable::CSPlaceholder; +use crate::gadgets::traits::encodable::CircuitVarLengthEncodable; +use crate::{ + cs::traits::cs::ConstraintSystem, + field::SmallField, + gadgets::{ + boolean::Boolean, + non_native_field::traits::NonNativeField, + traits::{ + allocatable::CSAllocatable, selectable::Selectable, witnessable::WitnessHookable, + }, + }, +}; +use crate::{cs::Variable, gadgets::traits::hardexp_compatible::HardexpCompatible}; + +/// `Fq12` field extension implementation in the constraint system. It is implemented +/// as `Fq6[w]/(w^2-v)` where `w^6=9+u`. In other words, it is a set of +/// linear polynomials in a form `c0+c1*w`, where `c0` and `c1` are elements of `Fq6`. +/// See https://hackmd.io/@jpw/bn254#Field-extension-towers for reference. For +/// implementation reference, see https://eprint.iacr.org/2006/471.pdf. +#[derive(Clone, Debug, Copy)] +pub struct Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + pub c0: Fq6, + pub c1: Fq6, + _marker: std::marker::PhantomData<(F, T)>, +} + +impl Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + /// Creates a new `Fq12` element from two `Fq6` components. + pub fn new(c0: Fq6, c1: Fq6) -> Self { + Self { + c0, + c1, + _marker: std::marker::PhantomData::<(F, T)>, + } + } + + pub fn from_c0c3c4( + cs: &mut CS, + c0: Fq2>::Ex6 as Extension6Params>::Ex2>, + c3: Fq2>::Ex6 as Extension6Params>::Ex2>, + c4: Fq2>::Ex6 as Extension6Params>::Ex2>, + ) -> Self + where + CS: ConstraintSystem, + { + let zero = Fq2::zero(cs, c0.c0.get_params()); + let c0 = Fq6::new(c0.clone(), zero.clone(), zero.clone()); + let c1 = Fq6::new(c3.clone(), c4.clone(), zero); + + Self::new(c0, c1) + } + + pub fn pow_u32>(&mut self, cs: &mut CS, exponent: S) -> Self + where + CS: ConstraintSystem, + { + let mut result = Self::one(cs, self.c0.c0.get_params()); + let mut found_one = false; + + for i in BitIterator::new(exponent) { + let apply_squaring = Boolean::allocated_constant(cs, found_one); + let result_squared = result.square(cs); + result = Self::conditionally_select(cs, apply_squaring, &result_squared, &result); + if !found_one { + found_one = i; + } + + let result_multiplied = result.mul(cs, self); + let apply_multiplication = Boolean::allocated_constant(cs, i); + result = + Self::conditionally_select(cs, apply_multiplication, &result_multiplied, &result); + + // Normalize the result to stay in field + NonNativeField::normalize(&mut result, cs); + } + + result + } + + /// Creates a new zero `Fq12` in a form `0+0*w` + pub fn zero(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = Fq6::zero(cs, params); + Self::new(zero.clone(), zero) + } + + /// Creates a unit `Fq12` in a form `1+0*w` + pub fn one(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let one = Fq6::one(cs, params); + let zero = Fq6::zero(cs, params); + Self::new(one, zero) + } + + /// Creates a unit `Fq12` in a form `0+1*w` + pub fn one_imaginary(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let one = Fq6::zero(cs, params); + let zero = Fq6::one(cs, params); + Self::new(one, zero) + } + + /// Returns true if the `Fq12` element is zero. 
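// ---------------------------------------------------------------------------
// Aside: `pow_u32` above is MSB-first square-and-multiply; because the
// exponent bits are circuit constants, branches become conditional selects,
// and `found_one` skips the useless squarings of 1 before the top set bit.
// A plain-integer analogue (all names illustrative), assuming
// base < modulus < 2^32 so the u64 products cannot overflow:
fn pow_sketch(base: u64, exponent: u64, modulus: u64) -> u64 {
    let mut result = 1u64;
    let mut found_one = false;
    for i in (0..64).rev() {
        let bit = (exponent >> i) & 1 == 1; // MSB first
        if found_one {
            result = result * result % modulus; // "select" the squared value
        }
        found_one |= bit;
        if bit {
            result = result * base % modulus; // "select" the multiplied value
        }
    }
    result
}
// For example, pow_sketch(3, 5, 1_000_000) == 243.
// ---------------------------------------------------------------------------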
+ pub fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_zero = self.c0.is_zero(cs); + let is_c1_zero = self.c1.is_zero(cs); + is_c0_zero.and(cs, is_c1_zero) + } + + /// Allocate `Fq12` tower extension element from the Witness represented in two components + /// from the `Fq6` tower extension. + pub fn constant(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = P::convert_from_structured_witness(wit); + let c0 = Fq6::constant(cs, c0, params); + let c1 = Fq6::constant(cs, c1, params); + + Self::new(c0, c1) + } + + /// Allocate `Fq12` tower extension element from the Witness represented in two components + /// from the `Fq6` tower extension. + pub fn allocate_from_witness(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = P::convert_from_structured_witness(wit); + let c0 = Fq6::allocate_from_witness(cs, c0, params); + let c1 = Fq6::allocate_from_witness(cs, c1, params); + + Self::new(c0, c1) + } + + /// Conjugates the `Fq12` element by negating the `c1` component. + pub fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c1 = self.c1.negated(cs); + Self::new(self.c0.clone(), c1) + } + + #[must_use] + pub fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.add(cs, &mut other.c0); + let c1 = self.c1.add(cs, &mut other.c1); + Self::new(c0, c1) + } + + #[must_use] + pub fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.double(cs); + let c1 = self.c1.double(cs); + Self::new(c0, c1) + } + + #[must_use] + pub fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.negated(cs); + let c1 = self.c1.negated(cs); + Self::new(c0, c1) + } + + #[must_use] + pub fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.sub(cs, &mut other.c0); + let c1 = self.c1.sub(cs, &mut other.c1); + Self::new(c0, c1) + } + + #[must_use] + pub fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let mut v0 = self.c0.mul(cs, &mut other.c0); + let mut v1 = self.c1.mul(cs, &mut other.c1); + let mut o = other.c0.add(cs, &mut other.c1); + + let mut c1 = self.c1.add(cs, &mut self.c0); + let mut c1 = c1.mul(cs, &mut o); + let mut c1 = c1.sub(cs, &mut v0); + let c1 = c1.sub(cs, &mut v1); + + let mut c0 = v1.mul_by_nonresidue(cs); + let c0 = c0.add(cs, &mut v0); + + Self::new(c0, c1) + } + + #[must_use] + pub fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let mut ab = self.c0.mul(cs, &mut self.c1); + let mut c0c1 = self.c0.add(cs, &mut self.c1); + + let mut c0 = self.c1.mul_by_nonresidue(cs); + let mut c0 = c0.add(cs, &mut self.c0); + let mut c0 = c0.mul(cs, &mut c0c1); + let mut c0 = c0.sub(cs, &mut ab); + + let c1 = ab.double(cs); + let mut ab_residue = ab.mul_by_nonresidue(cs); + let c0 = c0.sub(cs, &mut ab_residue); + + Self::new(c0, c1) + } + + pub fn mul_by_c0c1c4( + &mut self, + cs: &mut CS, + c0: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + c1: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + c4: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + ) -> Self + where + CS: ConstraintSystem, + { + let mut aa = self.c0.mul_by_c0c1(cs, c0, c1); + let mut bb = self.c1.mul_by_c1(cs, c4); + let mut o = c1.add(cs, c4); + + let mut new_c1 = self.c1.add(cs, &mut self.c0); + let mut new_c1 
= new_c1.mul_by_c0c1(cs, c0, &mut o); + let mut new_c1 = new_c1.sub(cs, &mut aa); + let new_c1 = new_c1.sub(cs, &mut bb); + + let mut new_c0 = bb.mul_by_nonresidue(cs); + let new_c0 = new_c0.add(cs, &mut aa); + + Self::new(new_c0, new_c1) + } + + /// Sparse multiplication by constants `c0` and `c3` and `c4` in the form `c0 + (c3 + c4*v)*w`. + /// See _Algorithm_ 21 from https://eprint.iacr.org/2010/354.pdf. + pub fn mul_by_c0c3c4( + &mut self, + cs: &mut CS, + c0: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + c3: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + c4: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + ) -> Self + where + CS: ConstraintSystem, + { + // Below, a0+a1*w is self b0+b1*w with b0=b00=c0 and b1=b10+b11*v=c3+c4*v + // is the element to multiply with + + // t0 <- a0*b0 + let mut t0 = self.c0.mul_by_c0(cs, c0); + // t1 <- a1*b1 + let mut t1 = self.c1.mul_by_c0c1(cs, c3, c4); + // c0 <- t0 + t1*gamma + let mut t1_gamma = t1.mul_by_nonresidue(cs); + let new_c0 = t0.add(cs, &mut t1_gamma); + // t2 <- (b0+b10)v + b11*v + 0*v^2 + let mut t2_c0 = c0.add(cs, c3); + let mut t2_c1 = c4.clone(); + // c1 <- (a0 + a1) * t2 + let mut new_c1 = self.c0.add(cs, &mut self.c1); + let mut new_c1 = new_c1.mul_by_c0c1(cs, &mut t2_c0, &mut t2_c1); + // c1 <- c1 - t0 - t1 + let mut new_c1 = new_c1.sub(cs, &mut t0); + let new_c1 = new_c1.sub(cs, &mut t1); + + Self::new(new_c0, new_c1) + } + + /// Multiplies the `Fq12` element by a constant `c5*v^2*w` represented as `Fq2`. + pub fn mul_by_c5( + &mut self, + cs: &mut CS, + c5: &mut Fq2>::Ex6 as Extension6Params>::Ex2>, + ) -> Self + where + CS: ConstraintSystem, + { + // Suppose our element is a0+a1*w. Then, + // (a0+a1*w)*c5*v^2*w = a1*c5*w^2*v^2 + a0*c5*v^2*w + // Notice that w^2*v^2 = v^3 = \xi and therefore our result + // is a1*c5*\xi + a0*c5*v^2*w + + // new_c0 <- a1*c5*\xi + let mut new_c0 = self.c1.mul_by_c0(cs, c5); + new_c0 = new_c0.mul_by_xi(cs); + + // new_c1 <- a0*c5*v^2*w + let new_c1 = self.c0.mul_by_c2(cs, c5); + + Self::new(new_c0, new_c1) + } + + /// Compute the Frobenius map - raise this element to power. 
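// Here `power` is the exponent i in the map x -> x^(p^i). Frobenius acts on
// c0 and c1 componentwise; additionally, in Fq12 = Fq6[w]/(w^2 - v) one has
// w^(p^i) = gamma_i * w for a precomputed constant gamma_i, which is why each
// Fq2 coefficient of c1 below is scaled by the same
// FROBENIUS_COEFFS_C1[power % 12] (matching the out-of-circuit Fq12
// implementation in the pairing crate).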
+ pub fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.frobenius_map(cs, power); + let mut c1 = self.c1.frobenius_map(cs, power); + + let c1_c0_frobenius_constant = P::FROBENIUS_COEFFS_C1[power % 12]; + let c1_c1_frobenius_constant = P::FROBENIUS_COEFFS_C1[power % 12]; + let c1_c2_frobenius_constant = P::FROBENIUS_COEFFS_C1[power % 12]; + + let params = c1.c0.get_params(); + + let mut c1_c0_frobenius_coeff = Fq2::constant(cs, c1_c0_frobenius_constant, params); + let mut c1_c1_frobenius_coeff = Fq2::constant(cs, c1_c1_frobenius_constant, params); + let mut c1_c2_frobenius_coeff = Fq2::constant(cs, c1_c2_frobenius_constant, params); + + let c1_c0 = c1.c0.mul(cs, &mut c1_c0_frobenius_coeff); + let c1_c1 = c1.c1.mul(cs, &mut c1_c1_frobenius_coeff); + let c1_c2 = c1.c2.mul(cs, &mut c1_c2_frobenius_coeff); + + let c1 = Fq6::new(c1_c0, c1_c1, c1_c2); + + Self::new(c0, c1) + } + + pub fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let mut c0s = self.c0.square(cs); + let mut c1s = self.c1.square(cs); + let mut c1s = c1s.mul_by_nonresidue(cs); + let mut c0s = c0s.sub(cs, &mut c1s); + + c0s.normalize(cs); + let mut t = c0s.inverse(cs); + let c0_new = t.mul(cs, &mut self.c0); + let mut c1_new = t.mul(cs, &mut self.c1); + let c1_new = c1_new.negated(cs); + + Self::new(c0_new, c1_new) + } + + pub fn div(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let mut t = other.inverse(cs); + self.mul(cs, &mut t) + } +} + +impl CSAllocatable for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + type Witness = ( + as CSAllocatable>::Witness, + as CSAllocatable>::Witness, + ); + + #[inline(always)] + fn placeholder_witness() -> Self::Witness { + ( + as CSAllocatable>::placeholder_witness(), + as CSAllocatable>::placeholder_witness(), + ) + } + + #[inline(always)] + fn allocate_without_value(cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = as CSAllocatable>::allocate_without_value(cs); + let c1 = as CSAllocatable>::allocate_without_value(cs); + + Self::new(c0, c1) + } + + #[inline(always)] + fn allocate(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = witness; + + let c0 = as CSAllocatable>::allocate(cs, c0); + let c1 = as CSAllocatable>::allocate(cs, c1); + + Self::new(c0, c1) + } + + #[inline(always)] + fn allocate_constant(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = witness; + + let c0 = as CSAllocatable>::allocate_constant(cs, c0); + let c1 = as CSAllocatable>::allocate_constant(cs, c1); + + Self::new(c0, c1) + } +} + +impl WitnessHookable for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + fn witness_hook(&self, cs: &CS) -> Box Option + 'static> + where + CS: ConstraintSystem, + { + let c0 = self.c0.witness_hook(cs); + let c1 = self.c1.witness_hook(cs); + + Box::new(move || { + let c0 = c0()?; + let c1 = c1()?; + + Some((c0, c1)) + }) + } +} + +impl CSPlaceholder for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + CSPlaceholder, + P: Extension12Params, +{ + fn placeholder>(cs: &mut CS) -> Self { + let placeholder = as CSPlaceholder>::placeholder(cs); + + Self::new(placeholder.clone(), placeholder.clone()) + } +} + +impl CircuitVarLengthEncodable for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + 
CircuitVarLengthEncodable, + P: Extension12Params, +{ + fn encoding_length(&self) -> usize { + self.c0.encoding_length() + self.c1.encoding_length() + } + + fn encode_to_buffer>(&self, cs: &mut CS, dst: &mut Vec) { + self.c0.encode_to_buffer(cs, dst); + self.c1.encode_to_buffer(cs, dst); + } +} + +impl NonNativeField for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + type Params = NN::Params; + + fn get_params(&self) -> &Arc { + self.c0.get_params() + } + + fn allocated_constant(cs: &mut CS, value: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = NN::allocated_constant(cs, T::zero(), params); + let c0 = NN::allocated_constant(cs, value, params); + let c0 = Fq2::new(c0, zero); + let c0 = Fq6::new(c0, Fq2::zero(cs, params), Fq2::zero(cs, params)); + let c1 = Fq6::zero(cs, params); + + Self::new(c0, c1) + } + + fn allocate_checked(cs: &mut CS, witness: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = NN::allocate_checked(cs, T::zero(), params); + let c0 = NN::allocate_checked(cs, witness, params); + let c0 = Fq2::new(c0, zero); + let c0 = Fq6::new(c0, Fq2::zero(cs, params), Fq2::zero(cs, params)); + let c1 = Fq6::zero(cs, params); + + Self::new(c0, c1) + } + + fn allocate_checked_without_value(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = Fq6::allocate_checked_without_value(cs, params); + let c1 = Fq6::allocate_checked_without_value(cs, params); + + Self::new(c0, c1) + } + + fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + self.is_zero(cs) + } + + fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.negated(cs) + } + + fn equals(&mut self, cs: &mut CS, other: &mut Self) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_equal = self.c0.equals(cs, &mut other.c0); + let is_c1_equal = self.c1.equals(cs, &mut other.c1); + is_c0_equal.and(cs, is_c1_equal) + } + + fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn lazy_add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn add_many_lazy(cs: &mut CS, inputs: [&mut Self; M]) -> Self + where + CS: ConstraintSystem, + { + assert!(M != 0, "add_many_lazy: inputs must not be empty"); + + let params = inputs[0].get_params(); + let mut result = Self::zero(cs, params); + + for i in 0..M { + result = result.add(cs, inputs[i]); + } + + result + } + + fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn lazy_sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn lazy_double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.mul(cs, other) + } + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.square(cs) + } + + fn div_unchecked(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.div(cs, other) + } + + #[allow(unused_variables)] + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let c0 = 
>::Ex6>>::conditionally_select( + cs, flag, &a.c0, &b.c0, + ); + let c1 = >::Ex6>>::conditionally_select( + cs, flag, &a.c1, &b.c1, + ); + + Self::new(c0, c1) + } + + #[allow(unused_variables)] + fn allocate_inverse_or_zero(&self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // TODO: Make check for zero. + let mut self_cloned = self.clone(); + self_cloned.inverse(cs) + } + + fn inverse_unchecked(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + #[allow(unused_variables)] + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.normalize(cs); + self.c1.normalize(cs); + } + + fn mask(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask(cs, masking_bit); + let c1 = self.c1.mask(cs, masking_bit); + + Self::new(c0, c1) + } + + fn mask_negated(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask_negated(cs, masking_bit); + let c1 = self.c1.mask_negated(cs, masking_bit); + + Self::new(c0, c1) + } + + fn enforce_reduced(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.enforce_reduced(cs); + self.c1.enforce_reduced(cs); + } + + fn enforce_equal(cs: &mut CS, a: &Self, b: &Self) + where + CS: ConstraintSystem, + { + Fq6::enforce_equal(cs, &a.c0, &b.c0); + Fq6::enforce_equal(cs, &a.c1, &b.c1); + } +} + +impl Selectable for Fq12 +where + F: SmallField, + NN: NonNativeField, +{ + fn conditionally_select(cs: &mut CS, flag: Boolean, a: &Self, b: &Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = + as Selectable>::conditionally_select( + cs, flag, &a.c0, &b.c0, + ); + let c1 = + as Selectable>::conditionally_select( + cs, flag, &a.c1, &b.c1, + ); + + Self::new(c0, c1) + } +} + +impl HardexpCompatible for Fq12 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension12Params, +{ + fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.conjugate(cs) + } + + fn pow_u32>(&mut self, cs: &mut CS, exponent: S) -> Self + where + CS: ConstraintSystem, + { + self.pow_u32(cs, exponent) + } + + fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + self.frobenius_map(cs, power) + } + + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.mul(cs, other) + } + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.square(cs) + } + + fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.normalize(cs); + self.c1.normalize(cs); + } +} diff --git a/src/gadgets/tower_extension/fq2.rs b/src/gadgets/tower_extension/fq2.rs new file mode 100644 index 0000000..93b0e18 --- /dev/null +++ b/src/gadgets/tower_extension/fq2.rs @@ -0,0 +1,657 @@ +use std::sync::Arc; + +use pairing::{ + bn256::{Fq as BN256Fq, Fq2 as BN256Fq2, G2Affine}, + ff::PrimeField, +}; + +use super::params::{bn256::BN256Extension2Params, Extension2Params}; + +use crate::cs::Variable; +use crate::gadgets::traits::allocatable::CSPlaceholder; +use crate::gadgets::traits::encodable::CircuitVarLengthEncodable; +use crate::{ + cs::traits::cs::ConstraintSystem, + field::SmallField, + gadgets::{ + boolean::Boolean, + non_native_field::traits::{CurveCompatibleNonNativeField, NonNativeField}, + traits::{ + allocatable::CSAllocatable, 
selectable::Selectable, witnessable::WitnessHookable, + }, + }, +}; + +/// BN256Fq2Params represents a pair of elements in the extension field `Fq2=Fq[u]/(u^2-beta)` +/// where `beta^2=-1`. The implementation is primarily based on the following paper: +/// https://eprint.iacr.org/2006/471.pdf. +#[derive(Clone, Debug, Copy)] +pub struct Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension2Params, +{ + pub c0: NN, + pub c1: NN, + wit: Option, + _marker: std::marker::PhantomData<(F, T, P)>, +} + +impl Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension2Params, +{ + /// Creates a new `Fq2` element from two `Fq` components. + pub fn new(c0: NN, c1: NN) -> Self { + Self { + c0, + c1, + wit: Option::None, // to get placeholder_witness we need CS + _marker: std::marker::PhantomData::<(F, T, P)>, + } + } + + /// Creates a new `Fq2` in a form `0+0*u` + pub fn zero(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = NN::allocated_constant(cs, T::zero(), params); + + Self::new(zero.clone(), zero) + } + + /// Creates a new `Fq2` in a form `1+0*u` + pub fn one(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let one = NN::allocated_constant(cs, T::one(), params); + let zero = NN::allocated_constant(cs, T::zero(), params); + + Self::new(one, zero) + } + + /// Adds two elements of `Fq2` by adding their components elementwise. + #[must_use] + pub fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.add(cs, &mut other.c0); + let c1 = self.c1.add(cs, &mut other.c1); + Self::new(c0, c1) + } + + /// Returns whether the element of `Fq2` is zero. + pub fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_zero = self.c0.is_zero(cs); + let is_c1_zero = self.c1.is_zero(cs); + is_c0_zero.and(cs, is_c1_zero) + } + + /// Doubles the element of `Fq2` by doubling its components. + #[must_use] + pub fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.double(cs); + let c1 = self.c1.double(cs); + Self::new(c0, c1) + } + + /// Negates the element of `Fq2` by negating its components. + #[must_use] + pub fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.negated(cs); + let c1 = self.c1.negated(cs); + Self::new(c0, c1) + } + + /// Conjugates the element `c=c0+c1*u` by computing `c=c0-c1*u`. + #[must_use] + pub fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c1 = self.c1.negated(cs); + Self::new(self.c0.clone(), c1) + } + + /// Subtracts two elements of `Fq2` by subtracting their components elementwise. + #[must_use] + pub fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.sub(cs, &mut other.c0); + let c1 = self.c1.sub(cs, &mut other.c1); + Self::new(c0, c1) + } + + /// Multiply the element `a=a0+a1*u` by the element `b=b0+b1*u` using the Karatsuba method. 
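+ /// Here `u^2 = -1`, so the identity below uses three base-field multiplications
+ /// instead of four: `v0 = a0*b0`, `v1 = a1*b1`, then
+ /// `c0 = v0 - v1` and `c1 = (a0 + a1)*(b0 + b1) - v0 - v1`.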
+ #[must_use] + pub fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + // v0 <- a0*b0, v1 <- a1*b1 + let mut v0 = self.c0.mul(cs, &mut other.c0); + let mut v1 = self.c1.mul(cs, &mut other.c1); + + // c0 <- v0 + beta*v1 + let c0 = v0.sub(cs, &mut v1); + + // c1 <- (a0 + a1)(b0 + b1) - v0 - v1 + let mut a0_plus_a1 = self.c0.add(cs, &mut self.c1); + let mut b0_plus_b1 = other.c0.add(cs, &mut other.c1); + let mut c1 = a0_plus_a1.mul(cs, &mut b0_plus_b1); + let mut c1 = c1.sub(cs, &mut v0); + let c1 = c1.sub(cs, &mut v1); + + Self::new(c0, c1) + } + + /// Square the element `a=a0+a1*u` by using the Karatsuba method. + #[must_use] + pub fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // v0 <- a0^2, v1 <- a1^2 + let mut v0 = self.c0.square(cs); + let mut v1 = self.c1.square(cs); + + // c0 <- v0 + beta*v1 + let c0 = v0.sub(cs, &mut v1); + + // c1 <- (a0 + a1)^2 - v0 - v1 + let mut a0_plus_a1 = self.c0.add(cs, &mut self.c1); + let mut c1 = a0_plus_a1.square(cs); + let mut c1 = c1.sub(cs, &mut v0); + let c1 = c1.sub(cs, &mut v1); + + Self::new(c0, c1) + } + + /// Multiply the element `a=a0+a1*u` by the element in the base field `Fq`. + #[must_use] + pub fn mul_c0(&mut self, cs: &mut CS, c0: &mut NN) -> Self + where + CS: ConstraintSystem, + { + // a*f = (a0 + a1*u)*f = (a0*f) + (a1*f)*u + let new_c0 = self.c0.mul(cs, c0); + let new_c1 = self.c1.mul(cs, c0); + Self::new(new_c0, new_c1) + } + + /// Finds the inverse of the element `a=a0+a1*u` in the extension field `Fq2`. + #[must_use] + pub fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let mut t0 = self.c0.square(cs); + let mut t1 = self.c1.square(cs); + let mut t0 = t0.add(cs, &mut t1); + let mut t = t0.inverse_unchecked(cs); + + let c0 = self.c0.mul(cs, &mut t); + let mut c1 = self.c1.mul(cs, &mut t); + let c1 = c1.negated(cs); + + Self::new(c0, c1) + } + + /// Divides the element `a=a0+a1*u` by the element `b=b0+b1*u` in the extension field `Fq2`. + #[must_use] + pub fn div(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let mut inv = other.inverse(cs); + self.mul(cs, &mut inv) + } + + /// Multiply this element by quadratic nonresidue 9 + u. + pub fn mul_by_nonresidue(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // Finding 8(a0 + a1*u) + let mut new = self.double(cs); + new = new.double(cs); + new = new.double(cs); + + // c0 <- 9*c0 - c1 + let mut c0 = new.c0.add(cs, &mut self.c0); + let c0 = c0.sub(cs, &mut self.c1); + + // c1 <- c0 + 9*c1 + let mut c1 = new.c1.add(cs, &mut self.c1); + let c1 = c1.add(cs, &mut self.c0); + + Self::new(c0, c1) + } + + /// Compute the Frobenius map - raise this element to power. + pub fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + let is_even = Boolean::allocated_constant(cs, power % 2 == 0); + + // TODO: check what non-residue == -1. + + let c0 = self.c0.clone(); + let c1 = self.c1.negated(cs); + + // TODO: assert what Fp2 under CS computes frobenius map same as without CS and this optimizational hack. + + as NonNativeField>::conditionally_select( + cs, + is_even, + &self.clone(), + &Self::new(c0, c1), + ) + } + + /// Allocate `Fq2` tower extension element from the Witness represented in two PrimeField components `c0` and `c1`. 
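+ /// Unlike [`Self::allocate_from_witness`], both components are embedded as
+ /// circuit constants rather than checked allocations.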
+ pub fn constant(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = P::convert_from_structured_witness(wit); + + let c0 = NN::allocated_constant(cs, c0, params); + let c1 = NN::allocated_constant(cs, c1, params); + + Self::new(c0, c1) + } + + /// Allocate `Fq2` tower extension element from the Witness represented in two PrimeField components `c0` and `c1`. + pub fn allocate_from_witness(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = P::convert_from_structured_witness(wit); + + let c0 = NN::allocate_checked(cs, c0, params); + let c1 = NN::allocate_checked(cs, c1, params); + + Self::new(c0, c1) + } +} + +impl CSAllocatable for Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension2Params, +{ + type Witness = (NN::Witness, NN::Witness); + + #[inline(always)] + fn placeholder_witness() -> Self::Witness { + (NN::placeholder_witness(), NN::placeholder_witness()) + } + + #[inline(always)] + fn allocate_without_value(cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocate_without_value(cs); + let c1 = NN::allocate_without_value(cs); + + Self::new(c0, c1) + } + + #[inline(always)] + fn allocate(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = witness; + + let c0 = NN::allocate(cs, c0); + let c1 = NN::allocate(cs, c1); + + Self::new(c0, c1) + } + + #[inline(always)] + fn allocate_constant(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1) = witness; + + let c0 = NN::allocate_constant(cs, c0); + let c1 = NN::allocate_constant(cs, c1); + + Self::new(c0, c1) + } +} + +impl WitnessHookable for Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension2Params, +{ + fn witness_hook(&self, cs: &CS) -> Box Option + 'static> + where + CS: ConstraintSystem, + { + let c0 = self.c0.witness_hook(cs); + let c1 = self.c1.witness_hook(cs); + + Box::new(move || { + let c0 = c0()?; + let c1 = c1()?; + + Some((c0, c1)) + }) + } +} + +impl CSPlaceholder for Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + CSPlaceholder, + P: Extension2Params, +{ + fn placeholder>(cs: &mut CS) -> Self { + let c0 = NN::placeholder(cs); + let c1 = NN::placeholder(cs); + + Self::new(c0, c1) + } +} + +impl CircuitVarLengthEncodable for Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + CircuitVarLengthEncodable, + P: Extension2Params, +{ + fn encoding_length(&self) -> usize { + self.c0.encoding_length() + self.c1.encoding_length() + } + + fn encode_to_buffer>(&self, cs: &mut CS, dst: &mut Vec) { + self.c0.encode_to_buffer(cs, dst); + self.c1.encode_to_buffer(cs, dst); + } +} + +impl NonNativeField for Fq2 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension2Params, +{ + type Params = NN::Params; + + fn get_params(&self) -> &Arc { + self.c0.get_params() + } + + fn allocated_constant(cs: &mut CS, value: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocated_constant(cs, value, params); + let c1 = NN::allocated_constant(cs, T::zero(), params); + + Self::new(c0, c1) + } + + fn allocate_checked(cs: &mut CS, witness: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocate_checked(cs, witness, params); + let c1 = NN::allocate_checked(cs, witness, params); + + Self::new(c0, c1) + } + + fn allocate_checked_without_value(cs: &mut CS, params: &Arc) -> Self 
+ where + CS: ConstraintSystem, + { + let c0 = NN::allocate_checked_without_value(cs, params); + let c1 = NN::allocate_checked_without_value(cs, params); + + Self::new(c0, c1) + } + + fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + self.is_zero(cs) + } + + fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.negated(cs) + } + + fn equals(&mut self, cs: &mut CS, other: &mut Self) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_equal = self.c0.equals(cs, &mut other.c0); + let is_c1_equal = self.c1.equals(cs, &mut other.c1); + is_c0_equal.and(cs, is_c1_equal) + } + + fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn lazy_add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn add_many_lazy(cs: &mut CS, inputs: [&mut Self; M]) -> Self + where + CS: ConstraintSystem, + { + assert!(M != 0, "add_many_lazy: inputs must not be empty"); + + let params = inputs[0].get_params(); + let mut result = Self::zero(cs, params); + + for i in 0..M { + result = result.add(cs, inputs[i]); + } + + result + } + + fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn lazy_sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn lazy_double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.mul(cs, other) + } + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.square(cs) + } + + fn div_unchecked(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.div(cs, other) + } + + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let c0 = NN::conditionally_select(cs, flag, &a.c0, &b.c0); + let c1 = NN::conditionally_select(cs, flag, &a.c1, &b.c1); + + Self::new(c0, c1) + } + + #[allow(unused_variables)] + fn allocate_inverse_or_zero(&self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // TODO: Make check for zero. 
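+ // NOTE: until that check is added, a zero input does not produce zero:
+ // the inversion below is only sound for non-zero elements.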
+ let mut self_cloned = self.clone(); + self_cloned.inverse(cs) + } + + fn inverse_unchecked(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + #[allow(unused_variables)] + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.normalize(cs); + self.c1.normalize(cs); + } + + fn mask(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask(cs, masking_bit); + let c1 = self.c1.mask(cs, masking_bit); + + Self::new(c0, c1) + } + + fn mask_negated(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask_negated(cs, masking_bit); + let c1 = self.c1.mask_negated(cs, masking_bit); + + Self::new(c0, c1) + } + + fn enforce_reduced(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.enforce_reduced(cs); + self.c1.enforce_reduced(cs); + } + + fn enforce_equal(cs: &mut CS, a: &Self, b: &Self) + where + CS: ConstraintSystem, + { + NN::enforce_equal(cs, &a.c0, &b.c0); + NN::enforce_equal(cs, &a.c1, &b.c1); + } +} + +impl Selectable for Fq2 +where + F: SmallField, + NN: NonNativeField, +{ + fn conditionally_select(cs: &mut CS, flag: Boolean, a: &Self, b: &Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::conditionally_select(cs, flag, &a.c0, &b.c0); + let c1 = NN::conditionally_select(cs, flag, &a.c1, &b.c1); + + Self::new(c0, c1) + } +} + +impl CurveCompatibleNonNativeField + for Fq2 +where + F: SmallField, + NN: NonNativeField, +{ + fn from_curve_base(cs: &mut CS, point: &BN256Fq2, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocated_constant(cs, point.c0, params); + let c1 = NN::allocated_constant(cs, point.c1, params); + + Self::new(c0, c1) + } +} diff --git a/src/gadgets/tower_extension/fq6.rs b/src/gadgets/tower_extension/fq6.rs new file mode 100644 index 0000000..e68ca98 --- /dev/null +++ b/src/gadgets/tower_extension/fq6.rs @@ -0,0 +1,860 @@ +use std::sync::Arc; + +use pairing::{bn256::Fq as BN256Fq, ff::PrimeField}; + +use super::{ + fq2::Fq2, + params::{ + bn256::{BN256Extension2Params, BN256Extension6Params}, + Extension6Params, + }, +}; + +use crate::cs::Variable; +use crate::gadgets::traits::allocatable::CSPlaceholder; +use crate::gadgets::traits::encodable::CircuitVarLengthEncodable; +use crate::{ + cs::traits::cs::ConstraintSystem, + field::SmallField, + gadgets::{ + boolean::Boolean, + non_native_field::traits::NonNativeField, + traits::{ + allocatable::CSAllocatable, selectable::Selectable, witnessable::WitnessHookable, + }, + }, +}; + +/// `Fq6` field extension implementation in the constraint system. It is implemented +/// as `Fq2[v]/(v^3-xi)` where `xi=9+u`. In other words, +/// it is a set of quadratic polynomials of a form `c0+c1*v+c2*v^2`, +/// where `c0`, `c1`, `c2` are elements of `Fq2`. +/// See https://hackmd.io/@jpw/bn254#Field-extension-towers for reference. For +/// implementation reference, see https://eprint.iacr.org/2006/471.pdf. +#[derive(Clone, Debug, Copy)] +pub struct Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension6Params, +{ + pub c0: Fq2, + pub c1: Fq2, + pub c2: Fq2, + _marker: std::marker::PhantomData<(F, T)>, +} + +impl Fq6 +where + F: SmallField, + T: pairing::ff::PrimeField, + NN: NonNativeField, + P: Extension6Params, +{ + /// Creates a new `Fq6` element from three `Fq2` components. 
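+ /// The result represents `c0 + c1*v + c2*v^2`.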
+ pub fn new( + c0: Fq2, + c1: Fq2, + c2: Fq2, + ) -> Self { + Self { + c0, + c1, + c2, + _marker: std::marker::PhantomData::<(F, T)>, + } + } + + /// Creates a new zero `Fq6` in a form `0+0*v+0*v^2` + pub fn zero(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = Fq2::zero(cs, params); + Self::new(zero.clone(), zero.clone(), zero) + } + + /// Creates a unit `Fq6` in a form `1+0*v+0*v^2` + pub fn one(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let one = Fq2::one(cs, params); + let zero = Fq2::zero(cs, params); + Self::new(one, zero.clone(), zero) + } + + /// Returns the `\gamma`: square root of `w`, being just a `0+1*v+0*v^2` element. + pub fn gamma(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let one = Fq2::one(cs, params); + let zero = Fq2::zero(cs, params); + Self::new(zero.clone(), one, zero) + } + + /// Returns `Fq6::one()` if `b` is true, and `Fq6::zero()` if `b` is false. + pub fn from_boolean(cs: &mut CS, b: Boolean, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let zero = Self::zero(cs, params); + let one = Self::one(cs, params); + Self::conditionally_select(cs, b, &one, &zero) + } + + /// Returns true if the `Fq6` element is zero. + pub fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_zero = self.c0.is_zero(cs); + let is_c1_zero = self.c1.is_zero(cs); + let is_c2_zero = self.c2.is_zero(cs); + Boolean::multi_and(cs, &[is_c0_zero, is_c1_zero, is_c2_zero]) + } + + /// Adds two elements of `Fq6` by adding their components elementwise. + #[must_use] + pub fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.add(cs, &mut other.c0); + let c1 = self.c1.add(cs, &mut other.c1); + let c2 = self.c2.add(cs, &mut other.c2); + Self::new(c0, c1, c2) + } + + /// Doubles the element of `Fq6` by doubling its components. + #[must_use] + pub fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.double(cs); + let c1 = self.c1.double(cs); + let c2 = self.c2.double(cs); + Self::new(c0, c1, c2) + } + + /// Negates the element of `Fq6` by negating its components. + #[must_use] + pub fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.negated(cs); + let c1 = self.c1.negated(cs); + let c2 = self.c2.negated(cs); + Self::new(c0, c1, c2) + } + + /// Subtracts two elements of `Fq6` by subtracting their components elementwise. + #[must_use] + pub fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.sub(cs, &mut other.c0); + let c1 = self.c1.sub(cs, &mut other.c1); + let c2 = self.c2.sub(cs, &mut other.c2); + Self::new(c0, c1, c2) + } + + /// Multiplies the element in `Fq6` by a non-residue `v`. + pub fn mul_by_nonresidue(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // c0, c1, c2 -> c2, c0, c1 + let new_c2 = self.c2.mul_by_nonresidue(cs); + Self::new(new_c2, self.c0.clone(), self.c1.clone()) + } + + /// Multiplies the element in `Fq6` by a non-residue `\xi=9+u`. 
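+ /// Since `\xi` lies in `Fq2`, the product distributes over the coefficients:
+ /// `\xi*(a0 + a1*v + a2*v^2) = (\xi*a0) + (\xi*a1)*v + (\xi*a2)*v^2`.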
+ pub fn mul_by_xi(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let new_c0 = self.c0.mul_by_nonresidue(cs); + let new_c1 = self.c1.mul_by_nonresidue(cs); + let new_c2 = self.c2.mul_by_nonresidue(cs); + + Self::new(new_c0, new_c1, new_c2) + } + + /// Multiplies two elements `a=a0+a1*v+a2*v^2` + /// and `b=b0+b1*v+b2*v^2` in `Fq6` using Karatsuba multiplication. + #[must_use] + pub fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let mut v0 = self.c0.mul(cs, &mut other.c0); + let mut v1 = self.c1.mul(cs, &mut other.c1); + let mut v2 = self.c2.mul(cs, &mut other.c2); + + let mut t1 = other.c1.add(cs, &mut other.c2); + let mut tmp = self.c1.add(cs, &mut self.c2); + + let mut t1 = t1.mul(cs, &mut tmp); + let mut t1 = t1.sub(cs, &mut v1); + let mut t1 = t1.sub(cs, &mut v2); + let mut t1 = t1.mul_by_nonresidue(cs); + let t1 = t1.add(cs, &mut v0); + + let mut t3 = other.c0.add(cs, &mut other.c2); + let mut tmp = self.c0.add(cs, &mut self.c2); + let mut t3 = t3.mul(cs, &mut tmp); + let mut t3 = t3.sub(cs, &mut v0); + let mut t3 = t3.add(cs, &mut v1); + let t3 = t3.sub(cs, &mut v2); + + let mut t2 = other.c0.add(cs, &mut other.c1); + let mut tmp = self.c0.add(cs, &mut self.c1); + let mut t2 = t2.mul(cs, &mut tmp); + let mut t2 = t2.sub(cs, &mut v0); + let mut t2 = t2.sub(cs, &mut v1); + let mut v2 = v2.mul_by_nonresidue(cs); + let t2 = t2.add(cs, &mut v2); + + Self::new(t1, t2, t3) + } + + /// Squares the element `a=a0+a1*v+a2*v^2` in `Fq6` using Karatsuba squaring. + #[must_use] + pub fn square>(&mut self, cs: &mut CS) -> Self { + // v0 <- a0^2, v1 <- a1^2, v2 <- a2^2 + let mut v0 = self.c0.square(cs); + let mut v1 = self.c1.square(cs); + let mut v2 = self.c2.square(cs); + + // c0 <- v0 + xi*((a1 + a2)^2 - v1 - v2) + let mut a1_plus_a2 = self.c1.add(cs, &mut self.c2); + let mut c0 = a1_plus_a2.square(cs); + let mut c0 = c0.sub(cs, &mut v1); + let mut c0 = c0.sub(cs, &mut v2); + let mut c0 = c0.mul_by_nonresidue(cs); + let c0 = c0.add(cs, &mut v0); + + // c1 <- (a0 + a1)^2 - v0 - v1 + xi*v2 + let mut a0_plus_a1 = self.c0.add(cs, &mut self.c1); + let mut c1 = a0_plus_a1.square(cs); + let mut c1 = c1.sub(cs, &mut v0); + let mut c1 = c1.sub(cs, &mut v1); + let mut xi_v2 = v2.mul_by_nonresidue(cs); + let c1 = c1.add(cs, &mut xi_v2); + + // c2 <- (a0 + a2)^2 - v0 + v1 - v2 + let mut a0_plus_a2 = self.c0.add(cs, &mut self.c2); + let mut c2 = a0_plus_a2.square(cs); + let mut c2 = c2.sub(cs, &mut v0); + let mut c2 = c2.add(cs, &mut v1); + let c2 = c2.sub(cs, &mut v2); + + Self::new(c0, c1, c2) + } + + /// Multiplies the element `a=a0+a1*v+a2*v^2` in `Fq6` by the element `b = b1*v` + pub fn mul_by_c1(&mut self, cs: &mut CS, c1: &mut Fq2) -> Self + where + CS: ConstraintSystem, + { + let mut b_b = self.c1.mul(cs, c1); + let mut tmp = self.c1.add(cs, &mut self.c2); + + let mut t1 = c1.mul(cs, &mut tmp); + let mut t1 = t1.sub(cs, &mut b_b); + let t1 = t1.mul_by_nonresidue(cs); + + let mut tmp = self.c0.add(cs, &mut self.c1); + let mut t2 = c1.mul(cs, &mut tmp); + let t2 = t2.sub(cs, &mut b_b); + + Self::new(t1, t2, b_b) + } + + /// Multiplies the element `a=a0+a1*v+a2*v^2` in `Fq6` by the element in `NonNativeField` + pub fn mul_by_fq(&mut self, cs: &mut CS, c0: &mut NN) -> Self + where + CS: ConstraintSystem, + { + // Simply multiply element-wise + let t0 = self.c0.mul_c0(cs, c0); + let t1 = self.c1.mul_c0(cs, c0); + let t2 = self.c2.mul_c0(cs, c0); + + Self::new(t0, t1, t2) + } + + /// Multiplies the element `a=a0+a1*v+a2*v^2` in `Fq6` by 
the element `c=c0` in `Fq2` + pub fn mul_by_c0(&mut self, cs: &mut CS, c0: &mut Fq2) -> Self + where + CS: ConstraintSystem, + { + // Simply multiply element-wise + let t0 = self.c0.mul(cs, c0); + let t1 = self.c1.mul(cs, c0); + let t2 = self.c2.mul(cs, c0); + + Self::new(t0, t1, t2) + } + + /// Multiplies the element `a=a0+a1*v+a2*v^2` in `Fq6` by the element `c2*v^2` + pub fn mul_by_c2(&mut self, cs: &mut CS, c2: &mut Fq2) -> Self + where + CS: ConstraintSystem, + { + // Suppose a = a0 + a1*v + a2*v^2. In this case, + // (a0 + a1*v + a2*v^2) * c2 * v^2 = + // a1*c2*\xi + a2*c2*\xi*v + a0*c2*v^2 + // NOTE: There might be a better way to calculate three coefficients + // without using 3 multiplications and 2 mul_by_nonresidues, similarly to mul_by_c1 + + // Setting coefficients + let mut a0 = self.c0.clone(); + let mut a1 = self.c1.clone(); + let mut a2 = self.c2.clone(); + + // new_c0 <- a1*c2*\xi + let mut new_c0 = a1.mul(cs, c2); + new_c0 = new_c0.mul_by_nonresidue(cs); + + // new_c1 <- a2*c2*\xi + let mut new_c1 = a2.mul(cs, c2); + new_c1 = new_c1.mul_by_nonresidue(cs); + + // new_c2 <- a0*c2 + let new_c2 = a0.mul(cs, c2); + + Self::new(new_c0, new_c1, new_c2) + } + + /// Multiplies the element `a=a0+a1*v+a2*v^2` in `Fq6` by the element `b = b0+b1*v` + pub fn mul_by_c0c1( + &mut self, + cs: &mut CS, + c0: &mut Fq2, + c1: &mut Fq2, + ) -> Self + where + CS: ConstraintSystem, + { + let mut a_a = self.c0.mul(cs, c0); + let mut b_b = self.c1.mul(cs, c1); + + let mut tmp = self.c1.add(cs, &mut self.c2); + let mut t1 = c1.mul(cs, &mut tmp); + let mut t1 = t1.sub(cs, &mut b_b); + let mut t1 = t1.mul_by_nonresidue(cs); + let t1 = t1.add(cs, &mut a_a); + + let mut tmp = self.c0.add(cs, &mut self.c2); + let mut t3 = c0.mul(cs, &mut tmp); + let mut t3 = t3.sub(cs, &mut a_a); + let t3 = t3.add(cs, &mut b_b); + + let mut t2 = c0.add(cs, c1); + let mut tmp = self.c0.add(cs, &mut self.c1); + let mut t2 = t2.mul(cs, &mut tmp); + let mut t2 = t2.sub(cs, &mut a_a); + let t2 = t2.sub(cs, &mut b_b); + + Self::new(t1, t2, t3) + } + + /// Find the inverse element in Fq6 + pub fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let mut c0 = self.c2.mul_by_nonresidue(cs); + let mut c0 = c0.mul(cs, &mut self.c1); + let mut c0 = c0.negated(cs); + + let mut c0s = self.c0.square(cs); + let mut c0 = c0.add(cs, &mut c0s); + + let mut c1 = self.c2.square(cs); + let mut c1 = c1.mul_by_nonresidue(cs); + + let mut c01 = self.c0.mul(cs, &mut self.c1); + let mut c1 = c1.sub(cs, &mut c01); + + let mut c2 = self.c1.square(cs); + let mut c02 = self.c0.mul(cs, &mut self.c2); + let mut c2 = c2.sub(cs, &mut c02); + + let mut tmp1 = self.c2.mul(cs, &mut c1); + let mut tmp2 = self.c1.mul(cs, &mut c2); + let mut tmp1 = tmp1.add(cs, &mut tmp2); + let mut tmp1 = tmp1.mul_by_nonresidue(cs); + let mut tmp2 = self.c0.mul(cs, &mut c0); + let mut tmp1 = tmp1.add(cs, &mut tmp2); + + let mut t = tmp1.inverse(cs); + let c0_new = t.mul(cs, &mut c0); + let c1_new = t.mul(cs, &mut c1); + let c2_new = t.mul(cs, &mut c2); + + Self::new(c0_new, c1_new, c2_new) + } + + pub fn div(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + let mut inv = other.inverse(cs); + self.mul(cs, &mut inv) + } + + /// Compute the Frobenius map - raise this element to power. 
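+ /// For `a = a0 + a1*v + a2*v^2` and `power = i`, this computes
+ /// `a0^{q^i} + C1[i mod 6]*a1^{q^i}*v + C2[i mod 6]*a2^{q^i}*v^2`,
+ /// with the constants taken from `FROBENIUS_COEFFS_C1`/`FROBENIUS_COEFFS_C2`.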
+ #[allow(unused_variables)] + pub fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.frobenius_map(cs, power); + let mut c1 = self.c1.frobenius_map(cs, power); + let mut c2 = self.c2.frobenius_map(cs, power); + + let c1_frobenius_constant = P::FROBENIUS_COEFFS_C1[power % 6]; + let c2_frobenius_constant = P::FROBENIUS_COEFFS_C2[power % 6]; + + let params = c1.get_params(); + + let mut c1_frobenius_coeff = Fq2::constant(cs, c1_frobenius_constant, params); + let mut c2_frobenius_coeff = Fq2::constant(cs, c2_frobenius_constant, params); + + let c1 = c1.mul(cs, &mut c1_frobenius_coeff); + let c2 = c2.mul(cs, &mut c2_frobenius_coeff); + + Self::new(c0, c1, c2) + } + + /// Normalizes the element of `Fq6` by normalizing its components. + pub fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.normalize(cs); + self.c1.normalize(cs); + self.c2.normalize(cs); + } + + /// Allocate `Fq6` tower extension element from the Witness represented in three components + /// from the `Fq2` tower extension. + pub fn constant(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let constants = P::convert_from_structured_witness(wit); + let c0 = Fq2::constant(cs, constants[0], params); + let c1 = Fq2::constant(cs, constants[1], params); + let c2 = Fq2::constant(cs, constants[2], params); + + Self::new(c0, c1, c2) + } + + /// Allocate `Fq6` tower extension element from the Witness represented in three components + /// from the `Fq2` tower extension. + pub fn allocate_from_witness(cs: &mut CS, wit: P::Witness, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let components = P::convert_from_structured_witness(wit); + let c0 = Fq2::allocate_from_witness(cs, components[0], params); + let c1 = Fq2::allocate_from_witness(cs, components[1], params); + let c2 = Fq2::allocate_from_witness(cs, components[2], params); + + Self::new(c0, c1, c2) + } +} + +impl CSAllocatable for Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension6Params, +{ + type Witness = ( + as CSAllocatable>::Witness, + as CSAllocatable>::Witness, + as CSAllocatable>::Witness, + ); + + #[inline(always)] + fn placeholder_witness() -> Self::Witness { + ( + as CSAllocatable>::placeholder_witness(), + as CSAllocatable>::placeholder_witness(), + as CSAllocatable>::placeholder_witness(), + ) + } + + #[inline(always)] + fn allocate_without_value(cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + let c0 = as CSAllocatable>::allocate_without_value(cs); + let c1 = as CSAllocatable>::allocate_without_value(cs); + let c2 = as CSAllocatable>::allocate_without_value(cs); + + Self::new(c0, c1, c2) + } + + #[inline(always)] + fn allocate(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1, c2) = witness; + + let c0 = as CSAllocatable>::allocate(cs, c0); + let c1 = as CSAllocatable>::allocate(cs, c1); + let c2 = as CSAllocatable>::allocate(cs, c2); + + Self::new(c0, c1, c2) + } + + #[inline(always)] + fn allocate_constant(cs: &mut CS, witness: Self::Witness) -> Self + where + CS: ConstraintSystem, + { + let (c0, c1, c2) = witness; + + let c0 = as CSAllocatable>::allocate_constant(cs, c0); + let c1 = as CSAllocatable>::allocate_constant(cs, c1); + let c2 = as CSAllocatable>::allocate_constant(cs, c2); + + Self::new(c0, c1, c2) + } +} + +impl WitnessHookable for Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension6Params, +{ 
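+ // The hook resolves to the structured `(c0, c1, c2)` witness once all three
+ // component hooks have produced values.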
+ fn witness_hook(&self, cs: &CS) -> Box Option + 'static> + where + CS: ConstraintSystem, + { + let c0 = self.c0.witness_hook(cs); + let c1 = self.c1.witness_hook(cs); + let c2 = self.c2.witness_hook(cs); + + Box::new(move || { + let c0 = c0()?; + let c1 = c1()?; + let c2 = c2()?; + + Some((c0, c1, c2)) + }) + } +} + +impl CSPlaceholder for Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + CSPlaceholder, + P: Extension6Params, +{ + fn placeholder>(cs: &mut CS) -> Self { + let placeholder = as CSPlaceholder>::placeholder(cs); + + Self::new(placeholder.clone(), placeholder.clone(), placeholder) + } +} + +impl CircuitVarLengthEncodable for Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField + CircuitVarLengthEncodable, + P: Extension6Params, +{ + fn encoding_length(&self) -> usize { + self.c0.encoding_length() + self.c1.encoding_length() + self.c1.encoding_length() + } + + fn encode_to_buffer>(&self, cs: &mut CS, dst: &mut Vec) { + self.c0.encode_to_buffer(cs, dst); + self.c1.encode_to_buffer(cs, dst); + self.c2.encode_to_buffer(cs, dst); + } +} + +impl NonNativeField for Fq6 +where + F: SmallField, + T: PrimeField, + NN: NonNativeField, + P: Extension6Params, +{ + type Params = NN::Params; + + fn get_params(&self) -> &Arc { + self.c0.get_params() + } + + fn allocated_constant(cs: &mut CS, value: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocated_constant(cs, value, params); + let c0 = Fq2::new(c0, NN::allocated_constant(cs, T::zero(), params)); + let c1 = Fq2::zero(cs, params); + let c2 = Fq2::zero(cs, params); + + Self::new(c0, c1, c2) + } + + fn allocate_checked(cs: &mut CS, witness: T, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = NN::allocate_checked(cs, witness, params); + let c0 = Fq2::new(c0, NN::allocated_constant(cs, T::zero(), params)); + let c1 = Fq2::zero(cs, params); + let c2 = Fq2::zero(cs, params); + + Self::new(c0, c1, c2) + } + + fn allocate_checked_without_value(cs: &mut CS, params: &Arc) -> Self + where + CS: ConstraintSystem, + { + let c0 = Fq2::allocate_checked_without_value(cs, params); + let c1 = Fq2::allocate_checked_without_value(cs, params); + let c2 = Fq2::allocate_checked_without_value(cs, params); + + Self::new(c0, c1, c2) + } + + fn is_zero(&mut self, cs: &mut CS) -> Boolean + where + CS: ConstraintSystem, + { + self.is_zero(cs) + } + + fn negated(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.negated(cs) + } + + fn equals(&mut self, cs: &mut CS, other: &mut Self) -> Boolean + where + CS: ConstraintSystem, + { + let is_c0_equal = self.c0.equals(cs, &mut other.c0); + let is_c1_equal = self.c1.equals(cs, &mut other.c1); + let is_c2_equal = self.c2.equals(cs, &mut other.c2); + Boolean::multi_and(cs, &[is_c0_equal, is_c1_equal, is_c2_equal]) + } + + fn add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn lazy_add(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.add(cs, other) + } + + fn add_many_lazy(cs: &mut CS, inputs: [&mut Self; M]) -> Self + where + CS: ConstraintSystem, + { + assert!(M != 0, "add_many_lazy: inputs must not be empty"); + + let params = inputs[0].get_params(); + let mut result = Self::zero(cs, params); + + for i in 0..M { + result = result.add(cs, inputs[i]); + } + + result + } + + fn sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn 
lazy_sub(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.sub(cs, other) + } + + fn double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn lazy_double(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.double(cs) + } + + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.mul(cs, other) + } + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.square(cs) + } + + fn div_unchecked(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem, + { + self.div(cs, other) + } + + #[allow(unused_variables)] + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let c0 = >::Ex2>>::conditionally_select( + cs, flag, &a.c0, &b.c0, + ); + let c1 = >::Ex2>>::conditionally_select( + cs, flag, &a.c1, &b.c1, + ); + let c2 = >::Ex2>>::conditionally_select( + cs, flag, &a.c2, &b.c2, + ); + + Self::new(c0, c1, c2) + } + + #[allow(unused_variables)] + fn allocate_inverse_or_zero(&self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + // TODO: Make check for zero. + let mut self_cloned = self.clone(); + self_cloned.inverse(cs) + } + + fn inverse_unchecked(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem, + { + self.inverse(cs) + } + + #[allow(unused_variables)] + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.normalize(cs); + self.c1.normalize(cs); + self.c2.normalize(cs); + } + + fn mask(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask(cs, masking_bit); + let c1 = self.c1.mask(cs, masking_bit); + let c2 = self.c2.mask(cs, masking_bit); + + Self::new(c0, c1, c2) + } + + fn mask_negated(&self, cs: &mut CS, masking_bit: Boolean) -> Self + where + CS: ConstraintSystem, + { + let c0 = self.c0.mask_negated(cs, masking_bit); + let c1 = self.c1.mask_negated(cs, masking_bit); + let c2 = self.c2.mask_negated(cs, masking_bit); + + Self::new(c0, c1, c2) + } + + fn enforce_reduced(&mut self, cs: &mut CS) + where + CS: ConstraintSystem, + { + self.c0.enforce_reduced(cs); + self.c1.enforce_reduced(cs); + self.c2.enforce_reduced(cs); + } + + fn enforce_equal(cs: &mut CS, a: &Self, b: &Self) + where + CS: ConstraintSystem, + { + Fq2::enforce_equal(cs, &a.c0, &b.c0); + Fq2::enforce_equal(cs, &a.c1, &b.c1); + Fq2::enforce_equal(cs, &a.c2, &b.c2); + } +} + +impl Selectable for Fq6 +where + F: SmallField, + NN: NonNativeField, +{ + fn conditionally_select(cs: &mut CS, flag: Boolean, a: &Self, b: &Self) -> Self + where + CS: ConstraintSystem, + { + let c0 = + as Selectable>::conditionally_select( + cs, flag, &a.c0, &b.c0, + ); + let c1 = + as Selectable>::conditionally_select( + cs, flag, &a.c1, &b.c1, + ); + let c2 = + as Selectable>::conditionally_select( + cs, flag, &a.c2, &b.c2, + ); + + Self::new(c0, c1, c2) + } +} diff --git a/src/gadgets/tower_extension/mod.rs b/src/gadgets/tower_extension/mod.rs new file mode 100644 index 0000000..d6a83be --- /dev/null +++ b/src/gadgets/tower_extension/mod.rs @@ -0,0 +1,5 @@ +pub mod algebraic_torus; +pub mod fq12; +pub mod fq2; +pub mod fq6; +pub mod params; diff --git a/src/gadgets/tower_extension/params/bn256.rs b/src/gadgets/tower_extension/params/bn256.rs new file mode 100644 index 0000000..0a40779 --- /dev/null +++ b/src/gadgets/tower_extension/params/bn256.rs @@ -0,0 +1,253 @@ +use pairing::bn256::{fq::Fq 
as BN256Fq, Fq12 as BN256Fq12, Fq2 as BN256Fq2, Fq6 as BN256Fq6}; + +use super::*; +use pairing::bn256::fq::{ + FROBENIUS_COEFF_FQ12_C1 as BN256_FROBENIUS_COEFF_FQ12_C1, + FROBENIUS_COEFF_FQ6_C1 as BN256_FROBENIUS_COEFF_FQ6_C1, + FROBENIUS_COEFF_FQ6_C2 as BN256_FROBENIUS_COEFF_FQ6_C2, +}; + +#[derive(Clone, Debug, Copy)] +pub struct BN256Extension2Params {} +impl Extension2Params for BN256Extension2Params { + type Witness = BN256Fq2; + + fn convert_to_structured_witness(c0: BN256Fq, c1: BN256Fq) -> Self::Witness { + BN256Fq2 { c0, c1 } + } + + fn convert_from_structured_witness(wit: Self::Witness) -> (BN256Fq, BN256Fq) { + (wit.c0, wit.c1) + } +} + +#[derive(Clone, Debug, Copy)] +pub struct BN256Extension6Params {} +impl Extension6Params for BN256Extension6Params { + type Ex2 = BN256Extension2Params; + type Witness = BN256Fq6; + + const FROBENIUS_COEFFS_C1: [BN256Fq2; 6] = BN256_FROBENIUS_COEFF_FQ6_C1; + const FROBENIUS_COEFFS_C2: [BN256Fq2; 6] = BN256_FROBENIUS_COEFF_FQ6_C2; + + fn convert_to_structured_witness(c0: BN256Fq2, c1: BN256Fq2, c2: BN256Fq2) -> Self::Witness { + Self::Witness { c0, c1, c2 } + } + + fn convert_from_structured_witness(wit: Self::Witness) -> [BN256Fq2; 3] { + [wit.c0, wit.c1, wit.c2] + } +} + +#[derive(Clone, Debug, Copy)] +pub struct BN256Extension12Params {} +impl Extension12Params for BN256Extension12Params { + type Ex6 = BN256Extension6Params; + type Witness = BN256Fq12; + + // These are Fp2 because we will multiply them with c1 `Fp6`, which has underlying `Fp2`. + const FROBENIUS_COEFFS_C1: + [<>::Ex2 as Extension2Params>::Witness; 12] = + BN256_FROBENIUS_COEFF_FQ12_C1; + + fn convert_to_structured_witness(c0: BN256Fq6, c1: BN256Fq6) -> Self::Witness { + Self::Witness { c0, c1 } + } + + fn convert_from_structured_witness(wit: Self::Witness) -> (BN256Fq6, BN256Fq6) { + (wit.c0, wit.c1) + } +} + +// Constants for torus extension +const TWO_INVERSE_C0: &str = + "10944121435919637611123202872628637544348155578648911831344518947322613104292"; +const W_INVERSE_C5_C0: &str = + "21087453498479301738505683583845423561061080261299122796980902361914303298513"; +const W_INVERSE_C5_C1: &str = + "14681138511599513868579906292550611339979233093309515871315818100066920017952"; + +impl BN256Extension12Params { + /// Returns the `gamma` element in `Fq6`, + /// being simply the element `0+1*v+0*v^2` in `Fq6`. + pub(super) fn gamma() -> BN256Fq6 { + BN256Fq6 { + c0: BN256Fq2::zero(), + c1: BN256Fq2::one(), + c2: BN256Fq2::zero(), + } + } + + /// Returns the `0+1*w` element in `Fq12` + pub(super) fn w() -> BN256Fq12 { + BN256Fq12 { + c0: BN256Fq6::zero(), + c1: BN256Fq6::one(), + } + } + + /// Decompresses a torus element from Fq6 to a field element Fq12. + /// + /// `g -> (g + w) / (g - w)` + pub(super) fn decompress_torus(g: BN256Fq6) -> BN256Fq12 { + let mut one = BN256Fq6::one(); + let mut result = BN256Fq12 { + c0: g, + c1: one.clone(), + }; + one.negate(); + let denominator = BN256Fq12 { c0: g, c1: one }; + let denominator_inverse = denominator.inverse().unwrap(); + result.mul_assign(&denominator_inverse); + + result + } + + /// Compresses a field element from Fq12 to torus Fq6. 
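+ /// For elements in the image of `decompress_torus`, this is its left inverse.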
+ /// + /// `m -> (1 + m0) / m1, m = m0 + m1*w` + pub(super) fn compress_torus(m: BN256Fq12) -> BN256Fq6 { + let mut result = m.c0.clone(); + result.add_assign(&BN256Fq6::one()); + + let inverse_denominator = m.c1.inverse().unwrap(); + result.mul_assign(&inverse_denominator); + + result + } +} + +impl TorusExtension12Params for BN256Extension12Params { + fn get_two_inverse_coeffs_c0() -> BN256Fq { + BN256Fq::from_str(TWO_INVERSE_C0).unwrap() + } + + fn get_w_inverse_coeffs_c5() -> BN256Fq2 { + BN256Fq2 { + c0: BN256Fq::from_str(W_INVERSE_C5_C0).unwrap(), + c1: BN256Fq::from_str(W_INVERSE_C5_C1).unwrap(), + } + } + + /// Native computation of torus squaring on encoding in Fq6. + /// + /// `g' = 1/2 (g + \gamma / g)` + fn torus_square(g: BN256Fq6) -> BN256Fq6 { + let gamma = Self::gamma(); + + let result = if g.is_zero() { + BN256Fq6::zero() + } else { + // Decompress g + let mut decompressed = Self::decompress_torus(g); + // Now that we are in fq12, square + decompressed.square(); + // Now, compress g back onto the torus so we can use it + Self::compress_torus(decompressed) + }; + + // Constraint check + // (2g' - g) * g = \gamma + let mut lhs = result.clone(); + + lhs.double(); + lhs.sub_assign(&g); + lhs.mul_assign(&g); + + let rhs = gamma.clone(); + + if !g.is_zero() { + assert_eq!(lhs, rhs, "witness lhs == rhs"); + } else { + assert_eq!(lhs, BN256Fq6::zero(), "g is zero, witness lhs == rhs"); + } + + result + } + + /// Native computation of torus multiplication on encoding in Fq6. + /// + /// `(g, g') -> (g * g' + \gamma) / (g + g')` + fn torus_mul( + g1: >::Witness, + g2: >::Witness, + ) -> >::Witness { + let gamma = Self::gamma(); + + let mut g1_add_g2 = g1.clone(); + g1_add_g2.add_assign(&g2); + + let result = if g1_add_g2.is_zero() { + BN256Fq6::zero() + } else { + // Decompress g1 + let decompressed_g1 = Self::decompress_torus(g1); + // Decompress g2 + let decompressed_g2 = Self::decompress_torus(g2); + // Multiply + let mut decompressed_g1_times_g2 = decompressed_g1.clone(); + decompressed_g1_times_g2.mul_assign(&decompressed_g2); + // Compress the result + Self::compress_torus(decompressed_g1_times_g2) + }; + + // Since we have g12 = (g1*g2 + \gamma) / (g1+g2), we can + // constraint require: + // g12 * (g1 + g2) == g1 * g2 + \gamma + + let mut lhs = result.clone(); + lhs.mul_assign(&g1_add_g2); + + let mut g1_times_g2 = g1.clone(); + g1_times_g2.mul_assign(&g2); + let mut rhs = g1_times_g2.clone(); + rhs.add_assign(&gamma); + + if g1_add_g2.is_zero() { + assert_eq!(lhs, BN256Fq6::zero(), "g1 + g2 is zero, witness lhs == rhs"); + } else { + assert_eq!(lhs, rhs, "witness lhs == rhs"); + } + + result + } + + /// Native computation of frobenius map + /// + /// `(g,i) -> f(g,i) / (f(w,i) * w^{-1})` where `f(g,i) = g^{q^{i}}` + fn torus_frobenius_map( + g: >::Witness, + power: usize, + ) -> >::Witness { + let mut result = Self::decompress_torus(g); + result.frobenius_map(power); + let result = Self::compress_torus(result); + + // Now, we need to check the constraint. Namely, suppose + // r is our result. 
Then, + // w * f(g, i) = f(w, i) * r + + // lhs = f(g, i) * w + let w = Self::w(); + let mut lhs = g.clone(); + lhs.frobenius_map(power); + let mut lhs = BN256Fq12{ + c0: lhs, + c1: BN256Fq6::zero(), + }; + lhs.mul_assign(&w); + + // rhs = f(w, i) * r + let mut rhs = Self::w(); + rhs.frobenius_map(power); + let r = BN256Fq12{ + c0: result, + c1: BN256Fq6::zero(), + }; + rhs.mul_assign(&r); + + assert_eq!(lhs, rhs, "witness lhs == rhs"); + + result + } +} diff --git a/src/gadgets/tower_extension/params/mod.rs b/src/gadgets/tower_extension/params/mod.rs new file mode 100644 index 0000000..d820c6e --- /dev/null +++ b/src/gadgets/tower_extension/params/mod.rs @@ -0,0 +1,95 @@ +use pairing::ff::{Field, PrimeField}; + +use std::fmt::Debug; + +pub mod bn256; + +// We don't have generic unconstrained tower extensions element, so we resolve it using following. +// Besides, one may include here field-specific characteristics, such as non-residue for example, +// and branch out implementations with the help of it. + +pub trait Extension2Params: 'static + Clone + Copy + Send + Sync + Debug { + /// Witness here represents field element not under CS. + type Witness: Field; + + fn convert_to_structured_witness(c0: P, c1: P) -> Self::Witness; + fn convert_from_structured_witness(val: Self::Witness) -> (P, P); +} + +pub trait Extension6Params: 'static + Clone + Copy + Send + Sync + Debug { + type Ex2: Extension2Params
<P>
; + /// Witness here represents field element not under CS. + type Witness: Field; + + const FROBENIUS_COEFFS_C1: [>::Witness; 6]; + const FROBENIUS_COEFFS_C2: [>::Witness; 6]; + + fn convert_to_structured_witness( + c0: >::Witness, + c1: >::Witness, + c2: >::Witness, + ) -> Self::Witness; + fn convert_from_structured_witness( + wit: Self::Witness, + ) -> [>::Witness; 3]; +} + +pub trait Extension12Params: 'static + Clone + Copy + Send + Sync + Debug { + type Ex6: Extension6Params
<P>
; + /// Witness here represents field element not under CS. + type Witness: Field; + + const FROBENIUS_COEFFS_C1: [<>::Ex2 as Extension2Params< + P, + >>::Witness; 12]; + + fn convert_to_structured_witness( + c0: >::Witness, + c1: >::Witness, + ) -> Self::Witness; + + fn convert_from_structured_witness( + wit: Self::Witness, + ) -> ( + >::Witness, + >::Witness, + ); +} + +pub trait TorusExtension12Params: + 'static + Clone + Copy + Send + Sync + Debug + Extension12Params +where + T: PrimeField, +{ + // NOTE: Here, we use selectors instead of constants as BN256Fq2 does not allow to allocate constant without accessing a private field. + // TODO: Not sure whether w^{-1} is just c5*v^2*w in a general Fq12 extension, but this is the case for BN254. + /// Assuming `w^{-1} = c5*v^2*w`, returns the coefficient `c5`. + fn get_w_inverse_coeffs_c5( + ) -> <>::Ex2 as Extension2Params>::Witness; + + /// Returns the constant c0 = 1/2 + fn get_two_inverse_coeffs_c0() -> T; + + /// Computes the square of a Torus element using the formula + /// + /// `g' -> 1/2 * (g - gamma/g)` + fn torus_square( + g: >::Witness, + ) -> >::Witness; + + /// Computes the product of two Torus elements using the formula + /// + /// `(g, g') -> (g * g' + \gamma) / (g + g')` + fn torus_mul( + g1: >::Witness, + g2: >::Witness, + ) -> >::Witness; + + /// Computes the Frobenius map of a Torus element + /// + /// `(g,i) -> w*f(g,i) / f(w,i)` where `f(g,i) = g^{q^{i}}` + fn torus_frobenius_map( + g: >::Witness, + power: usize, + ) -> >::Witness; +} diff --git a/src/gadgets/traits/hardexp_compatible.rs b/src/gadgets/traits/hardexp_compatible.rs new file mode 100644 index 0000000..3d2ae43 --- /dev/null +++ b/src/gadgets/traits/hardexp_compatible.rs @@ -0,0 +1,38 @@ +use crate::cs::traits::cs::ConstraintSystem; + +use super::SmallField; + +/// This trait is used to define the requirements for an element to be compatible +/// with the hard exponentiation step +pub trait HardexpCompatible: Clone +where + F: SmallField, +{ + fn mul(&mut self, cs: &mut CS, other: &mut Self) -> Self + where + CS: ConstraintSystem; + + fn square(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem; + + fn conjugate(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem; + + fn inverse(&mut self, cs: &mut CS) -> Self + where + CS: ConstraintSystem; + + fn frobenius_map(&mut self, cs: &mut CS, power: usize) -> Self + where + CS: ConstraintSystem; + + fn pow_u32>(&mut self, cs: &mut CS, exponent: S) -> Self + where + CS: ConstraintSystem; + + fn normalize(&mut self, cs: &mut CS) + where + CS: ConstraintSystem; +} diff --git a/src/gadgets/traits/mod.rs b/src/gadgets/traits/mod.rs index 8e79402..d1c4ac3 100644 --- a/src/gadgets/traits/mod.rs +++ b/src/gadgets/traits/mod.rs @@ -5,6 +5,7 @@ pub mod auxiliary; pub mod castable; pub mod configuration; pub mod encodable; +pub mod hardexp_compatible; pub mod round_function; pub mod selectable; pub mod witnessable; diff --git a/src/gadgets/u1024/mod.rs b/src/gadgets/u1024/mod.rs new file mode 100644 index 0000000..a4c66df --- /dev/null +++ b/src/gadgets/u1024/mod.rs @@ -0,0 +1,395 @@ +use super::*; +use crate::cs::traits::cs::ConstraintSystem; +use crate::cs::traits::cs::DstBuffer; +use crate::field::SmallField; +use crate::gadgets::boolean::Boolean; +use crate::gadgets::traits::allocatable::CSAllocatable; +use crate::gadgets::traits::allocatable::CSAllocatableExt; +use crate::gadgets::traits::witnessable::CSWitnessable; +use crate::gadgets::traits::witnessable::WitnessHookable; +use 
crate::gadgets::u32::UInt32; +use crate::gadgets::u8::UInt8; +use ethereum_types::U512; +use u512::UInt512; + +use crate::config::*; + +#[derive(Derivative)] +#[derivative(Clone, Copy, Debug, Hash)] +pub struct UInt1024 { + pub inner: [UInt32; 32], +} + +pub fn decompose_u1024_as_u32x32(value: (U512, U512)) -> [u32; 32] { + let mut result = [0u32; 32]; + // Filling the low limb + for i in 0..8 { + result[i * 2] = value.0 .0[i] as u32; + result[i * 2 + 1] = (value.0 .0[i] >> 32) as u32; + } + // Filling the high limb + for i in 0..8 { + result[i * 2 + 16] = value.1 .0[i] as u32; + result[i * 2 + 1 + 16] = (value.1 .0[i] >> 32) as u32; + } + + result +} + +pub fn recompose_u1024_as_u32x32(value: [u32; 32]) -> (U512, U512) { + // Filling the low limb + let mut low = U512::zero(); + for i in 0..8 { + low.0[i] = (value[i * 2] as u64) | ((value[i * 2 + 1] as u64) << 32); + } + + // Filling the high limb + let mut high = U512::zero(); + for i in 0..8 { + high.0[i] = (value[i * 2 + 16] as u64) | ((value[i * 2 + 1 + 16] as u64) << 32); + } + + (low, high) +} + +impl CSAllocatable for UInt1024 { + type Witness = (U512, U512); + fn placeholder_witness() -> Self::Witness { + (U512::zero(), U512::zero()) + } + + #[inline(always)] + #[must_use] + fn allocate_without_value>(cs: &mut CS) -> Self { + let vars = cs.alloc_multiple_variables_without_values::<32>(); + + let as_u32 = vars.map(|el| UInt32::from_variable_checked(cs, el)); + + Self { inner: as_u32 } + } + + #[must_use] + fn allocate>(cs: &mut CS, witness: Self::Witness) -> Self { + let chunks = decompose_u1024_as_u32x32(witness); + let chunks = chunks.map(|el| UInt32::allocate_checked(cs, el)); + Self { inner: chunks } + } +} + +impl CSAllocatableExt for UInt1024 { + const INTERNAL_STRUCT_LEN: usize = 32; + + fn witness_from_set_of_values(values: [F; Self::INTERNAL_STRUCT_LEN]) -> Self::Witness { + // value + recompose_u1024_as_u32x32( + values.map(|el| >::cast_from_source(el)), + ) + } + + // we should be able to allocate without knowing values yet + fn create_without_value>(cs: &mut CS) -> Self { + Self::allocate_without_value(cs) + } + + fn flatten_as_variables(&self) -> [Variable; Self::INTERNAL_STRUCT_LEN] + where + [(); Self::INTERNAL_STRUCT_LEN]:, + { + self.inner.map(|el| el.get_variable()) + } + + fn set_internal_variables_values(witness: Self::Witness, dst: &mut DstBuffer<'_, '_, F>) { + decompose_u1024_as_u32x32(witness).map(|el| UInt32::set_internal_variables_values(el, dst)); + } +} + +use crate::gadgets::traits::selectable::Selectable; + +impl Selectable for UInt1024 { + #[must_use] + fn conditionally_select>( + cs: &mut CS, + flag: Boolean, + a: &Self, + b: &Self, + ) -> Self { + let inner = Selectable::conditionally_select(cs, flag, &a.inner, &b.inner); + + Self { inner } + } +} + +impl UInt1024 { + #[must_use] + pub fn allocated_constant>( + cs: &mut CS, + constant: (U512, U512), + ) -> Self { + debug_assert!(F::CAPACITY_BITS >= 32); + + let chunks = decompose_u1024_as_u32x32(constant); + let chunks = chunks.map(|el| UInt32::allocated_constant(cs, el)); + Self { inner: chunks } + } + + #[must_use] + pub fn allocate_from_closure_and_dependencies< + CS: ConstraintSystem, + FN: FnOnce(&[F]) -> (U512, U512) + 'static + Send + Sync, + >( + cs: &mut CS, + witness_closure: FN, + dependencies: &[Place], + ) -> Self { + let outputs = cs.alloc_multiple_variables_without_values::<32>(); + + if ::WitnessConfig::EVALUATE_WITNESS { + let value_fn = move |inputs: &[F], output_buffer: &mut DstBuffer<'_, '_, F>| { + debug_assert!(F::CAPACITY_BITS 
>= 32); + let witness = (witness_closure)(inputs); + let chunks = decompose_u1024_as_u32x32(witness); + + output_buffer.extend(chunks.map(|el| F::from_u64_unchecked(el as u64))); + }; + + cs.set_values_with_dependencies_vararg( + dependencies, + &Place::from_variables(outputs), + value_fn, + ); + } + + let chunks = outputs.map(|el| UInt32::from_variable_checked(cs, el)); + Self { inner: chunks } + } + + #[must_use] + pub fn zero>(cs: &mut CS) -> Self { + Self::allocated_constant(cs, (U512::zero(), U512::zero())) + } + + #[must_use] + pub fn overflowing_add>( + &self, + cs: &mut CS, + other: &Self, + ) -> (Self, Boolean) { + let mut carry_out = Boolean::allocated_constant(cs, false); + let mut result = *self; // any uninit would be fine too + for ((a, b), dst) in self + .inner + .iter() + .zip(other.inner.iter()) + .zip(result.inner.iter_mut()) + { + let (c, carry) = (*a).overflowing_add_with_carry_in(cs, *b, carry_out); + *dst = c; + carry_out = carry; + } + + (result, carry_out) + } + + #[must_use] + pub fn overflowing_sub>( + &self, + cs: &mut CS, + other: &Self, + ) -> (Self, Boolean) { + let mut borrow_out = Boolean::allocated_constant(cs, false); + let mut result = *self; // any uninit would be fine too + for ((a, b), dst) in self + .inner + .iter() + .zip(other.inner.iter()) + .zip(result.inner.iter_mut()) + { + let (c, borrow) = (*a).overflowing_sub_with_borrow_in(cs, *b, borrow_out); + *dst = c; + borrow_out = borrow; + } + + (result, borrow_out) + } + + /// Multiplies a number by 2^{32}. Panics if the number overflows. + #[must_use] + pub fn must_mul_by_two_pow_32>(&self, cs: &mut CS) -> Self { + let boolean_true = Boolean::allocated_constant(cs, true); + let last_limb_zero = self.inner[31].is_zero(cs); + Boolean::enforce_equal(cs, &last_limb_zero, &boolean_true); + + let mut new_inner = self.inner; + new_inner.copy_within(0..31, 1); + new_inner[0] = UInt32::zero(cs); + + Self { inner: new_inner } + } + + // Returns the value unchanges if `bit` is `true`, and 0 otherwise + #[must_use] + pub fn mask>(&self, cs: &mut CS, masking_bit: Boolean) -> Self { + let new_inner = self.inner.map(|el| el.mask(cs, masking_bit)); + Self { inner: new_inner } + } + + // Returns the value unchanges if `bit` is `false`, and 0 otherwise + #[must_use] + pub fn mask_negated>( + &self, + cs: &mut CS, + masking_bit: Boolean, + ) -> Self { + let new_inner = self.inner.map(|el| el.mask_negated(cs, masking_bit)); + Self { inner: new_inner } + } + + #[must_use] + pub fn equals>(cs: &mut CS, a: &Self, b: &Self) -> Boolean { + let equals: [_; 32] = + std::array::from_fn(|idx| UInt32::equals(cs, &a.inner[idx], &b.inner[idx])); + + Boolean::multi_and(cs, &equals) + } + + #[must_use] + pub fn from_le_bytes>(cs: &mut CS, bytes: [UInt8; 128]) -> Self { + let mut inner = [std::mem::MaybeUninit::uninit(); 32]; + for (dst, src) in inner.iter_mut().zip(bytes.array_chunks::<4>()) { + dst.write(UInt32::from_le_bytes(cs, *src)); + } + + let inner = unsafe { inner.map(|el| el.assume_init()) }; + + Self { inner } + } + + #[must_use] + pub fn from_limbs(limbs: [UInt32; 32]) -> Self { + Self { inner: limbs } + } + + #[must_use] + pub fn from_be_bytes>(cs: &mut CS, bytes: [UInt8; 128]) -> Self { + let mut inner = [std::mem::MaybeUninit::uninit(); 32]; + for (dst, src) in inner.iter_mut().rev().zip(bytes.array_chunks::<4>()) { + dst.write(UInt32::from_be_bytes(cs, *src)); + } + + let inner = unsafe { inner.map(|el| el.assume_init()) }; + + Self { inner } + } + + #[must_use] + pub fn is_zero>(&self, cs: &mut CS) -> Boolean { + 
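+ // A 1024-bit value is zero iff all 32 of its 32-bit limbs are zero.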
let limbs_are_zero = self.inner.map(|el| el.is_zero(cs)); + Boolean::multi_and(cs, &limbs_are_zero) + } + + #[must_use] + pub fn to_le_bytes>(self, cs: &mut CS) -> [UInt8; 128] { + let mut encoding = [std::mem::MaybeUninit::uninit(); 128]; + for (dst, src) in encoding + .iter_mut() + .zip(self.inner.iter().flat_map(|el| el.to_le_bytes(cs))) + { + dst.write(src); + } + + unsafe { encoding.map(|el| el.assume_init()) } + } + + #[must_use] + pub fn to_be_bytes>(self, cs: &mut CS) -> [UInt8; 128] { + let mut bytes = self.to_le_bytes(cs); + bytes.reverse(); + + bytes + } + + #[must_use] + pub fn to_low(self) -> UInt512 { + UInt512 { + inner: self.inner[..16].try_into().expect("incorrect slice size"), + } + } + + #[must_use] + pub fn to_high(self) -> UInt512 { + UInt512 { + inner: self.inner[16..].try_into().expect("incorrect slice size"), + } + } +} + +use crate::cs::Variable; +use crate::gadgets::traits::castable::Convertor; +use crate::gadgets::traits::castable::WitnessCastable; + +impl WitnessCastable for (U512, U512) { + #[inline] + fn cast_from_source(witness: [F; 32]) -> Self { + let reduced = witness.map(|el| { + let el = el.as_u64_reduced(); + debug_assert!(el <= u32::MAX as u64); + + el as u32 + }); + + recompose_u1024_as_u32x32(reduced) + } + + #[inline] + fn cast_into_source(self) -> [F; 32] { + let limbs = decompose_u1024_as_u32x32(self); + limbs.map(|el| WitnessCastable::cast_into_source(el)) + } +} + +impl CSWitnessable for UInt1024 { + type ConversionFunction = Convertor; + + fn witness_from_set_of_values(values: [F; 32]) -> Self::Witness { + WitnessCastable::cast_from_source(values) + } + + fn as_variables_set(&self) -> [Variable; 32] { + self.inner.map(|el| el.get_variable()) + } +} + +impl WitnessHookable for UInt1024 { + fn witness_hook>( + &self, + cs: &CS, + ) -> Box Option> { + let raw_witness = self.get_witness(cs); + Box::new(move || raw_witness.wait()) + } +} + +use crate::gadgets::traits::selectable::MultiSelectable; +// multiselect doesn't make much sense here because we can do parallel over chunks, +// so we degrade to default impl via normal select +impl MultiSelectable for UInt1024 {} + +use crate::gadgets::traits::encodable::CircuitVarLengthEncodable; + +impl CircuitVarLengthEncodable for UInt1024 { + #[inline(always)] + fn encoding_length(&self) -> usize { + 32 + } + fn encode_to_buffer>(&self, cs: &mut CS, dst: &mut Vec) { + CircuitVarLengthEncodable::::encode_to_buffer(&self.inner, cs, dst); + } +} + +use crate::gadgets::traits::allocatable::CSPlaceholder; + +impl CSPlaceholder for UInt1024 { + fn placeholder>(cs: &mut CS) -> Self { + Self::zero(cs) + } +} diff --git a/src/gadgets/u2048/mod.rs b/src/gadgets/u2048/mod.rs new file mode 100644 index 0000000..e47fe5a --- /dev/null +++ b/src/gadgets/u2048/mod.rs @@ -0,0 +1,485 @@ +use super::*; +use crate::cs::traits::cs::ConstraintSystem; +use crate::cs::traits::cs::DstBuffer; +use crate::field::SmallField; +use crate::gadgets::boolean::Boolean; +use crate::gadgets::traits::allocatable::CSAllocatable; +use crate::gadgets::traits::allocatable::CSAllocatableExt; +use crate::gadgets::traits::witnessable::CSWitnessable; +use crate::gadgets::traits::witnessable::WitnessHookable; +use crate::gadgets::u32::UInt32; +use crate::gadgets::u8::UInt8; +use blake2s::mixing_function::merge_byte_using_table; +use crypto_bigint::U1024; +use tables::ByteSplitTable; +use u1024::UInt1024; +use u4096::UInt4096; + +use crate::config::*; + +#[derive(Derivative)] +#[derivative(Clone, Copy, Debug, Hash)] +pub struct UInt2048 { + pub inner: 
+    pub inner: [UInt32<F>; 64],
+}
+
+pub fn decompose_u2048_as_u32x64(value: (U1024, U1024)) -> [u32; 64] {
+    let low_limbs = value.0.as_limbs();
+    let high_limbs = value.1.as_limbs();
+
+    let mut result = [0u32; 64];
+    // Filling the low half
+    for i in 0..16 {
+        result[i * 2] = low_limbs[i].0 as u32;
+        result[i * 2 + 1] = (low_limbs[i].0 >> 32) as u32;
+    }
+    // Filling the high half
+    for i in 0..16 {
+        result[i * 2 + 32] = high_limbs[i].0 as u32;
+        result[i * 2 + 1 + 32] = (high_limbs[i].0 >> 32) as u32;
+    }
+
+    result
+}
+
+pub fn recompose_u2048_as_u32x64(value: [u32; 64]) -> (U1024, U1024) {
+    // Assembling the low half
+    let mut low = [0u64; 16];
+    for i in 0..16 {
+        low[i] = (value[i * 2] as u64) | ((value[i * 2 + 1] as u64) << 32);
+    }
+
+    // Assembling the high half
+    let mut high = [0u64; 16];
+    for i in 0..16 {
+        high[i] = (value[i * 2 + 32] as u64) | ((value[i * 2 + 1 + 32] as u64) << 32);
+    }
+
+    (U1024::from_words(low), U1024::from_words(high))
+}
+
+impl<F: SmallField> CSAllocatable<F> for UInt2048<F> {
+    type Witness = (U1024, U1024);
+    fn placeholder_witness() -> Self::Witness {
+        (U1024::ZERO, U1024::ZERO)
+    }
+
+    #[inline(always)]
+    #[must_use]
+    fn allocate_without_value<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        let vars = cs.alloc_multiple_variables_without_values::<64>();
+
+        let as_u32 = vars.map(|el| UInt32::from_variable_checked(cs, el));
+
+        Self { inner: as_u32 }
+    }
+
+    #[must_use]
+    fn allocate<CS: ConstraintSystem<F>>(cs: &mut CS, witness: Self::Witness) -> Self {
+        let chunks = decompose_u2048_as_u32x64(witness);
+        let chunks = chunks.map(|el| UInt32::allocate_checked(cs, el));
+        Self { inner: chunks }
+    }
+}
+
+impl<F: SmallField> CSAllocatableExt<F> for UInt2048<F> {
+    const INTERNAL_STRUCT_LEN: usize = 64;
+
+    fn witness_from_set_of_values(values: [F; Self::INTERNAL_STRUCT_LEN]) -> Self::Witness {
+        recompose_u2048_as_u32x64(
+            values.map(|el| <u32 as WitnessCastable<F, F>>::cast_from_source(el)),
+        )
+    }
+
+    // we should be able to allocate without knowing values yet
+    fn create_without_value<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::allocate_without_value(cs)
+    }
+
+    fn flatten_as_variables(&self) -> [Variable; Self::INTERNAL_STRUCT_LEN]
+    where
+        [(); Self::INTERNAL_STRUCT_LEN]:,
+    {
+        self.inner.map(|el| el.get_variable())
+    }
+
+    fn set_internal_variables_values(witness: Self::Witness, dst: &mut DstBuffer<'_, '_, F>) {
+        decompose_u2048_as_u32x64(witness).map(|el| UInt32::set_internal_variables_values(el, dst));
+    }
+}
+
+use crate::gadgets::traits::selectable::Selectable;
+
+impl<F: SmallField> Selectable<F> for UInt2048<F> {
+    #[must_use]
+    fn conditionally_select<CS: ConstraintSystem<F>>(
+        cs: &mut CS,
+        flag: Boolean<F>,
+        a: &Self,
+        b: &Self,
+    ) -> Self {
+        let inner = Selectable::conditionally_select(cs, flag, &a.inner, &b.inner);
+
+        Self { inner }
+    }
+}
+
+impl<F: SmallField> UInt2048<F> {
+    #[must_use]
+    pub fn allocated_constant<CS: ConstraintSystem<F>>(
+        cs: &mut CS,
+        constant: (U1024, U1024),
+    ) -> Self {
+        debug_assert!(F::CAPACITY_BITS >= 32);
+
+        let chunks = decompose_u2048_as_u32x64(constant);
+        let chunks = chunks.map(|el| UInt32::allocated_constant(cs, el));
+        Self { inner: chunks }
+    }
+
+    #[must_use]
+    pub fn allocate_from_closure_and_dependencies<
+        CS: ConstraintSystem<F>,
+        FN: FnOnce(&[F]) -> (U1024, U1024) + 'static + Send + Sync,
+    >(
+        cs: &mut CS,
+        witness_closure: FN,
+        dependencies: &[Place],
+    ) -> Self {
+        let outputs = cs.alloc_multiple_variables_without_values::<64>();
+
+        if <CS::Config as CSConfig>::WitnessConfig::EVALUATE_WITNESS {
+            let value_fn = move |inputs: &[F], output_buffer: &mut DstBuffer<'_, '_, F>| {
+                debug_assert!(F::CAPACITY_BITS >= 64);
+                let witness = (witness_closure)(inputs);
+                let chunks = decompose_u2048_as_u32x64(witness);
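+
+                // Each u32 chunk becomes one field element, in the little-endian
+                // limb order produced by decompose_u2048_as_u32x64.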
+                output_buffer.extend(chunks.map(|el| F::from_u64_unchecked(el as u64)));
+            };
+
+            cs.set_values_with_dependencies_vararg(
+                dependencies,
+                &Place::from_variables(outputs),
+                value_fn,
+            );
+        }
+
+        let chunks = outputs.map(|el| UInt32::from_variable_checked(cs, el));
+        Self { inner: chunks }
+    }
+
+    #[must_use]
+    pub fn zero<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::allocated_constant(cs, (U1024::ZERO, U1024::ZERO))
+    }
+
+    #[must_use]
+    pub fn overflowing_add<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &Self,
+    ) -> (Self, Boolean<F>) {
+        let mut carry_out = Boolean::allocated_constant(cs, false);
+        let mut result = *self; // any uninit would be fine too
+        for ((a, b), dst) in self
+            .inner
+            .iter()
+            .zip(other.inner.iter())
+            .zip(result.inner.iter_mut())
+        {
+            let (c, carry) = (*a).overflowing_add_with_carry_in(cs, *b, carry_out);
+            *dst = c;
+            carry_out = carry;
+        }
+
+        (result, carry_out)
+    }
+
+    #[must_use]
+    pub fn overflowing_sub<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &Self,
+    ) -> (Self, Boolean<F>) {
+        let mut borrow_out = Boolean::allocated_constant(cs, false);
+        let mut result = *self; // any uninit would be fine too
+        for ((a, b), dst) in self
+            .inner
+            .iter()
+            .zip(other.inner.iter())
+            .zip(result.inner.iter_mut())
+        {
+            let (c, borrow) = (*a).overflowing_sub_with_borrow_in(cs, *b, borrow_out);
+            *dst = c;
+            borrow_out = borrow;
+        }
+
+        (result, borrow_out)
+    }
+
+    #[must_use]
+    pub fn widening_mul<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &Self,
+        self_limbs: usize,
+        other_limbs: usize,
+    ) -> UInt4096<F> {
+        assert!(
+            self_limbs + other_limbs <= 128,
+            "total number of limbs must be <= 128"
+        );
+
+        let zero = UInt32::allocated_constant(cs, 0);
+        let mut remainders = vec![UInt32::<F>::zero(cs); self_limbs + other_limbs];
+
+        for i in 0..self_limbs {
+            let mut carry = UInt32::allocated_constant(cs, 0);
+            for j in 0..other_limbs {
+                let res = UInt32::fma_with_carry(
+                    cs,
+                    self.inner[i],
+                    other.inner[j],
+                    if i == 0 { zero } else { remainders[i + j] },
+                    carry,
+                );
+                (remainders[i + j], carry) = (res[0].0, res[1].0);
+            }
+            remainders[i + other_limbs] = carry;
+        }
+
+        let mut inner = [UInt32::<F>::zero(cs); 128];
+        inner[..self_limbs + other_limbs].copy_from_slice(&remainders);
+        UInt4096 { inner }
+    }
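+
+    // As a small illustration of the schoolbook loop above, in base B = 2^{32}:
+    //   (a1*B + a0) * (b1*B + b0) = a0*b0 + (a0*b1 + a1*b0)*B + a1*b1*B^2,
+    // and each `fma_with_carry` call accumulates one partial product a_i*b_j
+    // into `remainders[i + j]`, with the final carry of a row landing in
+    // `remainders[i + other_limbs]`.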
+
+    /// Multiplies a number by 2^{32}. Panics if the number overflows.
+    #[must_use]
+    pub fn must_mul_by_two_pow_32<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Self {
+        let boolean_true = Boolean::allocated_constant(cs, true);
+        let last_limb_zero = self.inner[63].is_zero(cs);
+        Boolean::enforce_equal(cs, &last_limb_zero, &boolean_true);
+
+        let mut new_inner = self.inner;
+        new_inner.copy_within(0..63, 1);
+        new_inner[0] = UInt32::zero(cs);
+
+        Self { inner: new_inner }
+    }
+
+    #[must_use]
+    pub fn div2<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Self {
+        let byte_split_id = cs
+            .get_table_id_for_marker::<ByteSplitTable<1>>()
+            .expect("table should exist");
+        let mut bytes = self.to_le_bytes(cs);
+        let mut bit: Option<Variable> = None;
+        bytes.iter_mut().rev().for_each(|b| {
+            let res = cs.perform_lookup::<1, 2>(byte_split_id, &[b.get_variable()]);
+            let mut shifted = res[1];
+            let new_bit = res[0];
+            if let Some(top_bit) = bit {
+                shifted = merge_byte_using_table::<_, _, 7>(cs, shifted, top_bit);
+            }
+            *b = UInt8 {
+                variable: shifted,
+                _marker: std::marker::PhantomData,
+            };
+            bit = Some(new_bit);
+        });
+        Self::from_le_bytes(cs, bytes)
+    }
+
+    /// Finds the result of multiplying `self` by `other` mod `modulo`.
+    #[must_use]
+    pub fn modmul<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &UInt2048<F>,
+        modulo: &UInt2048<F>,
+    ) -> UInt2048<F> {
+        // We take all 64 limbs of each factor, since the operands can be of any size
+        let product = self.widening_mul(cs, other, 64, 64);
+        let (_, remainder) = product.long_division(cs, modulo);
+        remainder
+    }
+
+    #[must_use]
+    pub fn is_odd<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Boolean<F> {
+        self.inner[0].into_num().spread_into_bits::<CS, 32>(cs)[0]
+    }
+
+    // Returns the value unchanged if `bit` is `true`, and 0 otherwise
+    #[must_use]
+    pub fn mask<CS: ConstraintSystem<F>>(&self, cs: &mut CS, masking_bit: Boolean<F>) -> Self {
+        let new_inner = self.inner.map(|el| el.mask(cs, masking_bit));
+        Self { inner: new_inner }
+    }
+
+    // Returns the value unchanged if `bit` is `false`, and 0 otherwise
+    #[must_use]
+    pub fn mask_negated<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        masking_bit: Boolean<F>,
+    ) -> Self {
+        let new_inner = self.inner.map(|el| el.mask_negated(cs, masking_bit));
+        Self { inner: new_inner }
+    }
+
+    #[must_use]
+    pub fn to_u4096<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> UInt4096<F> {
+        let mut u4096: UInt4096<F> = UInt4096::zero(cs);
+        u4096.inner[..64].copy_from_slice(&self.inner);
+        u4096
+    }
+
+    #[must_use]
+    pub fn equals<CS: ConstraintSystem<F>>(cs: &mut CS, a: &Self, b: &Self) -> Boolean<F> {
+        let equals: [_; 64] =
+            std::array::from_fn(|idx| UInt32::equals(cs, &a.inner[idx], &b.inner[idx]));
+
+        Boolean::multi_and(cs, &equals)
+    }
+
+    #[must_use]
+    pub fn from_le_bytes<CS: ConstraintSystem<F>>(cs: &mut CS, bytes: [UInt8<F>; 256]) -> Self {
+        let mut inner = [std::mem::MaybeUninit::uninit(); 64];
+        for (dst, src) in inner.iter_mut().zip(bytes.array_chunks::<4>()) {
+            dst.write(UInt32::from_le_bytes(cs, *src));
+        }
+
+        let inner = unsafe { inner.map(|el| el.assume_init()) };
+
+        Self { inner }
+    }
+
+    #[must_use]
+    pub fn from_limbs(limbs: [UInt32<F>; 64]) -> Self {
+        Self { inner: limbs }
+    }
+
+    #[must_use]
+    pub fn from_be_bytes<CS: ConstraintSystem<F>>(cs: &mut CS, bytes: [UInt8<F>; 256]) -> Self {
+        let mut inner = [std::mem::MaybeUninit::uninit(); 64];
+        for (dst, src) in inner.iter_mut().rev().zip(bytes.array_chunks::<4>()) {
+            dst.write(UInt32::from_be_bytes(cs, *src));
+        }
+
+        let inner = unsafe { inner.map(|el| el.assume_init()) };
+
+        Self { inner }
+    }
+
+    #[must_use]
+    pub fn is_zero<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Boolean<F> {
+        let limbs_are_zero = self.inner.map(|el| el.is_zero(cs));
+        Boolean::multi_and(cs, &limbs_are_zero)
+    }
+
+    #[must_use]
+    pub fn to_le_bytes<CS: ConstraintSystem<F>>(self, cs: &mut CS) -> [UInt8<F>; 256] {
+        let mut encoding = [std::mem::MaybeUninit::uninit(); 256];
+        for (dst, src) in encoding
+            .iter_mut()
+            .zip(self.inner.iter().flat_map(|el| el.to_le_bytes(cs)))
+        {
+            dst.write(src);
+        }
+
+        unsafe { encoding.map(|el| el.assume_init()) }
+    }
+
+    #[must_use]
+    pub fn to_be_bytes<CS: ConstraintSystem<F>>(self, cs: &mut CS) -> [UInt8<F>; 256] {
+        let mut bytes = self.to_le_bytes(cs);
+        bytes.reverse();
+
+        bytes
+    }
+
+    #[must_use]
+    pub fn to_low(self) -> UInt1024<F> {
+        UInt1024 {
+            inner: self.inner[..32].try_into().expect("incorrect slice size"),
+        }
+    }
+
+    #[must_use]
+    pub fn to_high(self) -> UInt1024<F> {
+        UInt1024 {
+            inner: self.inner[32..].try_into().expect("incorrect slice size"),
+        }
+    }
+}
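+
+// Sanity property of the helpers above (illustrative, with hypothetical values):
+// witness decomposition and recomposition are mutually inverse, e.g.
+//     let w = (U1024::from_u64(42), U1024::MAX);
+//     assert_eq!(recompose_u2048_as_u32x64(decompose_u2048_as_u32x64(w)), w);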
+
+use crate::cs::Variable;
+use crate::gadgets::traits::castable::Convertor;
+use crate::gadgets::traits::castable::WitnessCastable;
+
+impl<F: SmallField> WitnessCastable<F, [F; 64]> for (U1024, U1024) {
+    #[inline]
+    fn cast_from_source(witness: [F; 64]) -> Self {
+        let reduced = witness.map(|el| {
+            let el = el.as_u64_reduced();
+            debug_assert!(el <= u32::MAX as u64);
+
+            el as u32
+        });
+
+        recompose_u2048_as_u32x64(reduced)
+    }
+
+    #[inline]
+    fn cast_into_source(self) -> [F; 64] {
+        let limbs = decompose_u2048_as_u32x64(self);
+        limbs.map(|el| WitnessCastable::cast_into_source(el))
+    }
+}
+
+impl<F: SmallField> CSWitnessable<F, 64> for UInt2048<F> {
+    type ConversionFunction = Convertor<F, [F; 64], (U1024, U1024)>;
+
+    fn witness_from_set_of_values(values: [F; 64]) -> Self::Witness {
+        WitnessCastable::cast_from_source(values)
+    }
+
+    fn as_variables_set(&self) -> [Variable; 64] {
+        self.inner.map(|el| el.get_variable())
+    }
+}
+
+impl<F: SmallField> WitnessHookable<F> for UInt2048<F> {
+    fn witness_hook<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &CS,
+    ) -> Box<dyn FnOnce() -> Option<Self::Witness> + 'static> {
+        let raw_witness = self.get_witness(cs);
+        Box::new(move || raw_witness.wait())
+    }
+}
+
+use crate::gadgets::traits::selectable::MultiSelectable;
+// multiselect doesn't make much sense here because we can do parallel over chunks,
+// so we degrade to default impl via normal select
+impl<F: SmallField> MultiSelectable<F> for UInt2048<F> {}
+
+use crate::gadgets::traits::encodable::CircuitVarLengthEncodable;
+
+impl<F: SmallField> CircuitVarLengthEncodable<F> for UInt2048<F> {
+    #[inline(always)]
+    fn encoding_length(&self) -> usize {
+        64
+    }
+    fn encode_to_buffer<CS: ConstraintSystem<F>>(&self, cs: &mut CS, dst: &mut Vec<Variable>) {
+        CircuitVarLengthEncodable::<F>::encode_to_buffer(&self.inner, cs, dst);
+    }
+}
+
+use crate::gadgets::traits::allocatable::CSPlaceholder;
+
+impl<F: SmallField> CSPlaceholder<F> for UInt2048<F> {
+    fn placeholder<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::zero(cs)
+    }
+}
diff --git a/src/gadgets/u256/mod.rs b/src/gadgets/u256/mod.rs
index 5047f97..819f802 100644
--- a/src/gadgets/u256/mod.rs
+++ b/src/gadgets/u256/mod.rs
@@ -329,6 +329,13 @@ impl<F: SmallField> UInt256<F> {
         bytes
     }
 
+    #[must_use]
+    pub fn to_u512<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> UInt512<F> {
+        let mut u512 = UInt512::zero(cs);
+        u512.inner[..8].copy_from_slice(&self.inner);
+        u512
+    }
+
     #[must_use]
     pub fn div2<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Self {
         let byte_split_id = cs
@@ -351,6 +358,19 @@ impl<F: SmallField> UInt256<F> {
         });
         Self::from_le_bytes(cs, bytes)
     }
+
+    /// Finds the result of multiplying `self` by `other` mod `modulo`.
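+    /// The full 512-bit product is computed first and then reduced with
+    /// [`UInt512::long_division`]; `modulo` is expected to be non-zero.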
+    #[must_use]
+    pub fn modmul<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &UInt256<F>,
+        modulo: &UInt256<F>,
+    ) -> UInt256<F> {
+        // We take all 8 limbs of each factor, since the operands can be of any size
+        let product = self.widening_mul(cs, other, 8, 8);
+        let (_, remainder) = product.long_division(cs, modulo);
+        remainder
+    }
 }
 
 use crate::cs::Variable;
diff --git a/src/gadgets/u4096/mod.rs b/src/gadgets/u4096/mod.rs
new file mode 100644
index 0000000..310ee4b
--- /dev/null
+++ b/src/gadgets/u4096/mod.rs
@@ -0,0 +1,542 @@
+use super::*;
+use crate::cs::traits::cs::ConstraintSystem;
+use crate::cs::traits::cs::DstBuffer;
+use crate::field::SmallField;
+use crate::gadgets::boolean::Boolean;
+use crate::gadgets::traits::allocatable::CSAllocatable;
+use crate::gadgets::traits::allocatable::CSAllocatableExt;
+use crate::gadgets::traits::witnessable::CSWitnessable;
+use crate::gadgets::traits::witnessable::WitnessHookable;
+use crate::gadgets::u32::UInt32;
+use crate::gadgets::u8::UInt8;
+use crypto_bigint::U1024;
+use crypto_bigint::U2048;
+use u2048::UInt2048;
+
+use crate::config::*;
+
+#[derive(Derivative)]
+#[derivative(Clone, Copy, Debug, Hash)]
+pub struct UInt4096<F: SmallField> {
+    pub inner: [UInt32<F>; 128],
+}
+
+pub fn decompose_u4096_as_u32x128(value: (U2048, U2048)) -> [u32; 128] {
+    let low_limbs = value.0.as_limbs();
+    let high_limbs = value.1.as_limbs();
+
+    let mut result = [0u32; 128];
+    // Filling the low half
+    for i in 0..32 {
+        result[i * 2] = low_limbs[i].0 as u32;
+        result[i * 2 + 1] = (low_limbs[i].0 >> 32) as u32;
+    }
+    // Filling the high half
+    for i in 0..32 {
+        result[i * 2 + 64] = high_limbs[i].0 as u32;
+        result[i * 2 + 1 + 64] = (high_limbs[i].0 >> 32) as u32;
+    }
+
+    result
+}
+
+pub fn recompose_u4096_as_u32x128(value: [u32; 128]) -> (U2048, U2048) {
+    // Assembling the low half
+    let mut low = [0u64; 32];
+    for i in 0..32 {
+        low[i] = (value[i * 2] as u64) | ((value[i * 2 + 1] as u64) << 32);
+    }
+
+    // Assembling the high half; the high u32 chunks start at offset 64,
+    // matching decompose_u4096_as_u32x128 above
+    let mut high = [0u64; 32];
+    for i in 0..32 {
+        high[i] = (value[i * 2 + 64] as u64) | ((value[i * 2 + 1 + 64] as u64) << 32);
+    }
+
+    (U2048::from_words(low), U2048::from_words(high))
+}
+
+pub fn convert_limb_to_u4096<F, CS>(cs: &mut CS, limb: &UInt32<F>) -> UInt4096<F>
+where
+    F: SmallField,
+    CS: ConstraintSystem<F>,
+{
+    let mut u4096 = UInt4096::zero(cs);
+    u4096.inner[0] = *limb;
+    u4096
+}
+
+impl<F: SmallField> CSAllocatable<F> for UInt4096<F> {
+    type Witness = (U2048, U2048);
+    fn placeholder_witness() -> Self::Witness {
+        (U2048::ZERO, U2048::ZERO)
+    }
+
+    #[inline(always)]
+    #[must_use]
+    fn allocate_without_value<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        let vars = cs.alloc_multiple_variables_without_values::<128>();
+
+        let as_u32 = vars.map(|el| UInt32::from_variable_checked(cs, el));
+
+        Self { inner: as_u32 }
+    }
+
+    #[must_use]
+    fn allocate<CS: ConstraintSystem<F>>(cs: &mut CS, witness: Self::Witness) -> Self {
+        let chunks = decompose_u4096_as_u32x128(witness);
+        let chunks = chunks.map(|el| UInt32::allocate_checked(cs, el));
+        Self { inner: chunks }
+    }
+}
+
+impl<F: SmallField> CSAllocatableExt<F> for UInt4096<F> {
+    const INTERNAL_STRUCT_LEN: usize = 128;
+
+    fn witness_from_set_of_values(values: [F; Self::INTERNAL_STRUCT_LEN]) -> Self::Witness {
+        recompose_u4096_as_u32x128(
+            values.map(|el| <u32 as WitnessCastable<F, F>>::cast_from_source(el)),
+        )
+    }
+
+    // we should be able to allocate without knowing values yet
+    fn create_without_value<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::allocate_without_value(cs)
+    }
+
+    fn flatten_as_variables(&self) -> [Variable; Self::INTERNAL_STRUCT_LEN]
+    where
+        [(); Self::INTERNAL_STRUCT_LEN]:,
+    {
+        self.inner.map(|el| el.get_variable())
+    }
+
+    fn set_internal_variables_values(witness: Self::Witness, dst: &mut DstBuffer<'_, '_, F>) {
+        decompose_u4096_as_u32x128(witness)
+            .map(|el| UInt32::set_internal_variables_values(el, dst));
+    }
+}
+
+use crate::gadgets::traits::selectable::Selectable;
+
+impl<F: SmallField> Selectable<F> for UInt4096<F> {
+    #[must_use]
+    fn conditionally_select<CS: ConstraintSystem<F>>(
+        cs: &mut CS,
+        flag: Boolean<F>,
+        a: &Self,
+        b: &Self,
+    ) -> Self {
+        let inner = Selectable::conditionally_select(cs, flag, &a.inner, &b.inner);
+
+        Self { inner }
+    }
+}
+
+impl<F: SmallField> UInt4096<F> {
+    #[must_use]
+    pub fn allocated_constant<CS: ConstraintSystem<F>>(
+        cs: &mut CS,
+        constant: (U2048, U2048),
+    ) -> Self {
+        debug_assert!(F::CAPACITY_BITS >= 32);
+
+        let chunks = decompose_u4096_as_u32x128(constant);
+        let chunks = chunks.map(|el| UInt32::allocated_constant(cs, el));
+        Self { inner: chunks }
+    }
+
+    #[must_use]
+    pub fn allocate_from_closure_and_dependencies<
+        CS: ConstraintSystem<F>,
+        FN: FnOnce(&[F]) -> (U2048, U2048) + 'static + Send + Sync,
+    >(
+        cs: &mut CS,
+        witness_closure: FN,
+        dependencies: &[Place],
+    ) -> Self {
+        let outputs = cs.alloc_multiple_variables_without_values::<128>();
+
+        if <CS::Config as CSConfig>::WitnessConfig::EVALUATE_WITNESS {
+            let value_fn = move |inputs: &[F], output_buffer: &mut DstBuffer<'_, '_, F>| {
+                debug_assert!(F::CAPACITY_BITS >= 64);
+                let witness = (witness_closure)(inputs);
+                let chunks = decompose_u4096_as_u32x128(witness);
+
+                output_buffer.extend(chunks.map(|el| F::from_u64_unchecked(el as u64)));
+            };
+
+            cs.set_values_with_dependencies_vararg(
+                dependencies,
+                &Place::from_variables(outputs),
+                value_fn,
+            );
+        }
+
+        let chunks = outputs.map(|el| UInt32::from_variable_checked(cs, el));
+        Self { inner: chunks }
+    }
+
+    #[must_use]
+    pub fn zero<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::allocated_constant(cs, (U2048::ZERO, U2048::ZERO))
+    }
+
+    /// Returns `true` if `self >= other`, and `false` otherwise.
+    /// Here, `self` and `other` are represented as [`UInt4096`] and [`UInt2048`] respectively.
+    #[must_use]
+    pub fn geq_than_u2048<CS>(&self, cs: &mut CS, other: &UInt2048<F>) -> Boolean<F>
+    where
+        CS: ConstraintSystem<F>,
+    {
+        let high = self.to_high();
+        let under_2048 = high.is_zero(cs);
+        let over_2048 = under_2048.negated(cs);
+        let low = self.to_low();
+        let (sub, overflow) = other.overflowing_sub(cs, &low);
+        let a_equal_b = sub.is_zero(cs);
+        Boolean::multi_or(cs, &[overflow, a_equal_b, over_2048])
+    }
+
+    #[must_use]
+    pub fn overflowing_add<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &Self,
+    ) -> (Self, Boolean<F>) {
+        let mut carry_out = Boolean::allocated_constant(cs, false);
+        let mut result = *self; // any uninit would be fine too
+        for ((a, b), dst) in self
+            .inner
+            .iter()
+            .zip(other.inner.iter())
+            .zip(result.inner.iter_mut())
+        {
+            let (c, carry) = (*a).overflowing_add_with_carry_in(cs, *b, carry_out);
+            *dst = c;
+            carry_out = carry;
+        }
+
+        (result, carry_out)
+    }
+
+    #[must_use]
+    pub fn overflowing_sub<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        other: &Self,
+    ) -> (Self, Boolean<F>) {
+        let mut borrow_out = Boolean::allocated_constant(cs, false);
+        let mut result = *self; // any uninit would be fine too
+        for ((a, b), dst) in self
+            .inner
+            .iter()
+            .zip(other.inner.iter())
+            .zip(result.inner.iter_mut())
+        {
+            let (c, borrow) = (*a).overflowing_sub_with_borrow_in(cs, *b, borrow_out);
+            *dst = c;
+            borrow_out = borrow;
+        }
+
+        (result, borrow_out)
+    }
+
+    /// Multiplies a number by 2^{32}. Panics if the number overflows.
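+    /// Internally this shifts the limb array up by one 32-bit limb after
+    /// enforcing that the most significant limb is already zero.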
+    #[must_use]
+    pub fn must_mul_by_two_pow_32<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Self {
+        let boolean_true = Boolean::allocated_constant(cs, true);
+        let last_limb_zero = self.inner[127].is_zero(cs);
+        Boolean::enforce_equal(cs, &last_limb_zero, &boolean_true);
+
+        let mut new_inner = self.inner;
+        new_inner.copy_within(0..127, 1);
+        new_inner[0] = UInt32::zero(cs);
+
+        Self { inner: new_inner }
+    }
+
+    /// Find quotient and remainder of division of `self` by `other` using the naive long
+    /// division algorithm in base `2^{32}`, since both [`UInt4096`] and [`UInt2048`] are
+    /// represented as arrays of [`UInt32`]. The implementation is based on the algorithm
+    /// https://en.wikipedia.org/wiki/Long_division#Algorithm_for_arbitrary_base,
+    /// where `k=128`, `l=64`, and base `b=2^{32}`.
+    #[must_use]
+    pub fn long_division<CS>(&self, cs: &mut CS, other: &UInt2048<F>) -> (UInt4096<F>, UInt2048<F>)
+    where
+        CS: ConstraintSystem<F>,
+    {
+        const U2048_MAX_LIMBS: usize = 64;
+        const U4096_MAX_LIMBS: usize = 128;
+        // The digit beta lies in [0, 2^{32}], so binary search converges in at
+        // most 33 iterations, exactly as in the UInt512 version of this routine
+        const MAX_BINARY_SEARCH_ITERATIONS: usize = 33;
+
+        // Initializing constants; the base is b = 2^{32}
+        let base = U1024::from_u64(1u64 << 32);
+        let base = UInt2048::allocated_constant(cs, (base, U1024::ZERO));
+        let boolean_false = Boolean::allocated_constant(cs, false);
+        let one = UInt2048::allocated_constant(cs, (U1024::ONE, U1024::ZERO));
+
+        // q <- 0
+        let mut q = UInt4096::zero(cs);
+
+        // r <- first 63 limbs of self, thus it fits in UInt2048
+        let mut r = self.to_high();
+        r.inner[0] = UInt32::zero(cs);
+        r.inner.copy_within(1..U2048_MAX_LIMBS, 0);
+        r.inner[U2048_MAX_LIMBS - 1] = UInt32::zero(cs);
+
+        for i in 0..U2048_MAX_LIMBS + 1 {
+            // \alpha_{i+l-1} is the (k-l-i)-th limb of n
+            let alpha = self.inner[U2048_MAX_LIMBS - i];
+            let alpha = convert_limb_to_u4096(cs, &alpha);
+
+            // d_i <- b*r_{i-1} + \alpha_{i+l-1}
+            // d_i can safely be UInt4096 in size.
+            // r can have any number of limbs up to 64.
+            // base is 2 limbs wide since b=(2^{32}-1)+1
+            // TODO: Mul by base might be optimized
+            let d = base.widening_mul(cs, &r, 2, 64);
+            let (d_plus_alpha, overflow) = d.overflowing_add(cs, &alpha);
+            // d_i cannot overflow UInt4096
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+            let d = d_plus_alpha;
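+
+            // Worked example of one digit step, in base 10 instead of 2^{32} for
+            // readability: dividing 7489 by 32, the first partial remainder is
+            // r = 7, so d = 10*7 + 4 = 74, and the search below settles on the
+            // digit beta = 2, leaving the new remainder 74 - 2*32 = 10.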
+
+            // beta_i <- next digit of the quotient. We use binary search to
+            // find a suitable beta_i
+            let mut beta = UInt2048::zero(cs);
+            let mut left = UInt2048::zero(cs);
+            let mut right = base;
+
+            // Preparing new_r to further update r
+            let mut new_r = UInt4096::zero(cs);
+
+            for _ in 0..MAX_BINARY_SEARCH_ITERATIONS {
+                // beta <- ceil((right + left) / 2)
+                let (new_beta, overflow) = right.overflowing_add(cs, &left);
+                // Cannot overflow since right and left are at most b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+
+                // Since new_beta.div2 gives floor, we need to add 1 if new_beta is odd to get ceil
+                let odd = new_beta.is_odd(cs);
+                let beta_div_2 = new_beta.div2(cs);
+                let (beta_div_2_plus_1, overflow) = beta_div_2.overflowing_add(cs, &one);
+                // Cannot overflow since beta_div_2 + one is at most b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+                beta = UInt2048::conditionally_select(cs, odd, &beta_div_2_plus_1, &beta_div_2);
+
+                // r <- d - m * beta
+                // beta can fit in 2 limbs since it is less than or equal to b=2^{32}
+                let m_beta = other.widening_mul(cs, &beta, 64, 2);
+                let (r, r_negative) = d.overflowing_sub(cs, &m_beta);
+
+                // if r < 0 (that is, overflow occurred), then right <- beta - 1
+                // beta - 1 might overflow at the last step, but we don't care about it
+                let (beta_minus_1, _) = beta.overflowing_sub(cs, &one);
+                right = UInt2048::conditionally_select(cs, r_negative, &beta_minus_1, &right);
+
+                // if r >= m, then left <- beta + 1
+                let r_geq_m = r.geq_than_u2048(cs, other);
+                // We should handle the case when r overflowed
+                let r_positive = r_negative.negated(cs);
+                let r_greater_m = r_geq_m.and(cs, r_positive);
+                let (beta_plus_1, overflow) = beta.overflowing_add(cs, &one);
+                // Cannot overflow since beta < b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+                left = UInt2048::conditionally_select(cs, r_greater_m, &beta_plus_1, &left);
+
+                // Updating r
+                new_r = r
+            }
+
+            // Assert that new_r indeed fits in UInt2048
+            let boolean_true = Boolean::allocated_constant(cs, true);
+            for limb in new_r.inner[U2048_MAX_LIMBS..].iter() {
+                let limb_is_zero = limb.is_zero(cs);
+                Boolean::enforce_equal(cs, &limb_is_zero, &boolean_true);
+            }
+            // Update r
+            r = new_r.to_low();
+
+            // Assert that r < m
+            let (_, overflow) = other.overflowing_sub(cs, &r);
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+
+            // q_i <- b*q_{i-1} + beta_i
+            let beta_u4096 = beta.to_u4096(cs);
+            q = q.must_mul_by_two_pow_32(cs);
+            let (new_q, overflow) = q.overflowing_add(cs, &beta_u4096);
+            // Cannot overflow since the quotient cannot exceed 2^{4096}
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+            q = new_q;
+        }
+
+        (q, r)
+    }
+
+    // Returns the value unchanged if `bit` is `true`, and 0 otherwise
+    #[must_use]
+    pub fn mask<CS: ConstraintSystem<F>>(&self, cs: &mut CS, masking_bit: Boolean<F>) -> Self {
+        let new_inner = self.inner.map(|el| el.mask(cs, masking_bit));
+        Self { inner: new_inner }
+    }
+
+    // Returns the value unchanged if `bit` is `false`, and 0 otherwise
+    #[must_use]
+    pub fn mask_negated<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &mut CS,
+        masking_bit: Boolean<F>,
+    ) -> Self {
+        let new_inner = self.inner.map(|el| el.mask_negated(cs, masking_bit));
+        Self { inner: new_inner }
+    }
+
+    #[must_use]
+    pub fn equals<CS: ConstraintSystem<F>>(cs: &mut CS, a: &Self, b: &Self) -> Boolean<F> {
+        let equals: [_; 128] =
+            std::array::from_fn(|idx| UInt32::equals(cs, &a.inner[idx], &b.inner[idx]));
+
+        Boolean::multi_and(cs, &equals)
+    }
+
+    #[must_use]
+    pub fn from_le_bytes<CS: ConstraintSystem<F>>(cs: &mut CS, bytes: [UInt8<F>; 512]) -> Self {
+        let mut inner = [std::mem::MaybeUninit::uninit(); 128];
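+        // Little-endian layout: the first 4-byte chunk becomes the least
+        // significant limb.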
+        for (dst, src) in inner.iter_mut().zip(bytes.array_chunks::<4>()) {
+            dst.write(UInt32::from_le_bytes(cs, *src));
+        }
+
+        let inner = unsafe { inner.map(|el| el.assume_init()) };
+
+        Self { inner }
+    }
+
+    #[must_use]
+    pub fn from_limbs(limbs: [UInt32<F>; 128]) -> Self {
+        Self { inner: limbs }
+    }
+
+    #[must_use]
+    pub fn from_be_bytes<CS: ConstraintSystem<F>>(cs: &mut CS, bytes: [UInt8<F>; 512]) -> Self {
+        let mut inner = [std::mem::MaybeUninit::uninit(); 128];
+        for (dst, src) in inner.iter_mut().rev().zip(bytes.array_chunks::<4>()) {
+            dst.write(UInt32::from_be_bytes(cs, *src));
+        }
+
+        let inner = unsafe { inner.map(|el| el.assume_init()) };
+
+        Self { inner }
+    }
+
+    #[must_use]
+    pub fn is_zero<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Boolean<F> {
+        let limbs_are_zero = self.inner.map(|el| el.is_zero(cs));
+        Boolean::multi_and(cs, &limbs_are_zero)
+    }
+
+    #[must_use]
+    pub fn to_le_bytes<CS: ConstraintSystem<F>>(self, cs: &mut CS) -> [UInt8<F>; 512] {
+        let mut encoding = [std::mem::MaybeUninit::uninit(); 512];
+        for (dst, src) in encoding
+            .iter_mut()
+            .zip(self.inner.iter().flat_map(|el| el.to_le_bytes(cs)))
+        {
+            dst.write(src);
+        }
+
+        unsafe { encoding.map(|el| el.assume_init()) }
+    }
+
+    #[must_use]
+    pub fn to_be_bytes<CS: ConstraintSystem<F>>(self, cs: &mut CS) -> [UInt8<F>; 512] {
+        let mut bytes = self.to_le_bytes(cs);
+        bytes.reverse();
+
+        bytes
+    }
+
+    #[must_use]
+    pub fn to_low(self) -> UInt2048<F> {
+        UInt2048 {
+            inner: self.inner[..64].try_into().expect("incorrect slice size"),
+        }
+    }
+
+    #[must_use]
+    pub fn to_high(self) -> UInt2048<F> {
+        UInt2048 {
+            inner: self.inner[64..].try_into().expect("incorrect slice size"),
+        }
+    }
+}
+
+use crate::cs::Variable;
+use crate::gadgets::traits::castable::Convertor;
+use crate::gadgets::traits::castable::WitnessCastable;
+
+impl<F: SmallField> WitnessCastable<F, [F; 128]> for (U2048, U2048) {
+    #[inline]
+    fn cast_from_source(witness: [F; 128]) -> Self {
+        let reduced = witness.map(|el| {
+            let el = el.as_u64_reduced();
+            debug_assert!(el <= u32::MAX as u64);
+
+            el as u32
+        });
+
+        recompose_u4096_as_u32x128(reduced)
+    }
+
+    #[inline]
+    fn cast_into_source(self) -> [F; 128] {
+        let limbs = decompose_u4096_as_u32x128(self);
+        limbs.map(|el| WitnessCastable::cast_into_source(el))
+    }
+}
+
+impl<F: SmallField> CSWitnessable<F, 128> for UInt4096<F> {
+    type ConversionFunction = Convertor<F, [F; 128], (U2048, U2048)>;
+
+    fn witness_from_set_of_values(values: [F; 128]) -> Self::Witness {
+        WitnessCastable::cast_from_source(values)
+    }
+
+    fn as_variables_set(&self) -> [Variable; 128] {
+        self.inner.map(|el| el.get_variable())
+    }
+}
+
+impl<F: SmallField> WitnessHookable<F> for UInt4096<F> {
+    fn witness_hook<CS: ConstraintSystem<F>>(
+        &self,
+        cs: &CS,
+    ) -> Box<dyn FnOnce() -> Option<Self::Witness> + 'static> {
+        let raw_witness = self.get_witness(cs);
+        Box::new(move || raw_witness.wait())
+    }
+}
+
+use crate::gadgets::traits::selectable::MultiSelectable;
+// multiselect doesn't make much sense here because we can do parallel over chunks,
+// so we degrade to default impl via normal select
+impl<F: SmallField> MultiSelectable<F> for UInt4096<F> {}
+
+use crate::gadgets::traits::encodable::CircuitVarLengthEncodable;
+
+impl<F: SmallField> CircuitVarLengthEncodable<F> for UInt4096<F> {
+    #[inline(always)]
+    fn encoding_length(&self) -> usize {
+        // UInt4096 carries 128 u32 limbs, so its encoding is 128 variables long
+        128
+    }
+    fn encode_to_buffer<CS: ConstraintSystem<F>>(&self, cs: &mut CS, dst: &mut Vec<Variable>) {
+        CircuitVarLengthEncodable::<F>::encode_to_buffer(&self.inner, cs, dst);
+    }
+}
+
+use crate::gadgets::traits::allocatable::CSPlaceholder;
+
+impl<F: SmallField> CSPlaceholder<F> for UInt4096<F> {
+    fn placeholder<CS: ConstraintSystem<F>>(cs: &mut CS) -> Self {
+        Self::zero(cs)
+    }
+}
diff --git a/src/gadgets/u512/mod.rs b/src/gadgets/u512/mod.rs
index bb3781b..43b435f 100644
--- a/src/gadgets/u512/mod.rs
+++ b/src/gadgets/u512/mod.rs
@@ -56,6 +56,16 @@ pub fn recompose_u512_as_u32x16(value: [u32; 16]) -> (U256, U256) {
     (result_1, result_2)
 }
 
+pub fn convert_limb_to_u512<F, CS>(cs: &mut CS, limb: &UInt32<F>) -> UInt512<F>
+where
+    F: SmallField,
+    CS: ConstraintSystem<F>,
+{
+    let mut u512 = UInt512::zero(cs);
+    u512.inner[0] = *limb;
+    u512
+}
+
 impl<F: SmallField> CSAllocatable<F> for UInt512<F> {
     type Witness = (U256, U256);
     fn placeholder_witness() -> Self::Witness {
@@ -216,6 +226,153 @@ impl<F: SmallField> UInt512<F> {
         (result, borrow_out)
     }
 
+    /// Multiplies a number by 2^{32}. Panics if the number overflows.
+    #[must_use]
+    pub fn must_mul_by_2_pow_32<CS: ConstraintSystem<F>>(&self, cs: &mut CS) -> Self {
+        let boolean_true = Boolean::allocated_constant(cs, true);
+        let last_limb_zero = self.inner[15].is_zero(cs);
+        Boolean::enforce_equal(cs, &last_limb_zero, &boolean_true);
+
+        let mut new_inner = self.inner;
+        new_inner.copy_within(0..15, 1);
+        new_inner[0] = UInt32::zero(cs);
+
+        Self { inner: new_inner }
+    }
+
+    /// Returns `true` if `self >= other`, and `false` otherwise.
+    /// Here, `self` and `other` are represented as `UInt512` and `UInt256` respectively.
+    pub fn geq_than_u256<CS>(&self, cs: &mut CS, other: &UInt256<F>) -> Boolean<F>
+    where
+        CS: ConstraintSystem<F>,
+    {
+        let high = self.to_high();
+        let under_256 = high.is_zero(cs);
+        let over_256 = under_256.negated(cs);
+        let low = self.to_low();
+        let (sub, overflow) = other.overflowing_sub(cs, &low);
+        let a_equal_b = sub.is_zero(cs);
+        Boolean::multi_or(cs, &[overflow, a_equal_b, over_256])
+    }
+
+    /// Find quotient and remainder of division of `self` by `other` using the naive long
+    /// division algorithm in base `2^{32}`, since both [`UInt512`] and [`UInt256`] are
+    /// represented as arrays of [`UInt32`]. The implementation is based on the algorithm
+    /// https://en.wikipedia.org/wiki/Long_division#Algorithm_for_arbitrary_base,
+    /// where `k=16`, `l=8`, and base `b=2^{32}`.
+    ///
+    /// Currently, only the division by [`UInt256`] is supported.
+    #[must_use]
+    pub fn long_division<CS>(&self, cs: &mut CS, other: &UInt256<F>) -> (UInt512<F>, UInt256<F>)
+    where
+        CS: ConstraintSystem<F>,
+    {
+        const U256_MAX_LIMBS: usize = 8;
+        const U512_MAX_LIMBS: usize = 16;
+        const MAX_BINARY_SEARCH_ITERATIONS: usize = 33;
+
+        // Initializing constants
+        let base = U256::from_str_radix("0x100000000", 16).unwrap();
+        let base = UInt256::allocated_constant(cs, base);
+        let boolean_false = Boolean::allocated_constant(cs, false);
+        let one = UInt256::allocated_constant(cs, U256::one());
+
+        // q <- 0
+        let mut q = UInt512::zero(cs);
+
+        // r <- first 7 limbs of n, thus it fits in UInt256
+        let mut r = self.to_high();
+        r.inner[0] = UInt32::zero(cs);
+        r.inner.copy_within(1..U256_MAX_LIMBS, 0);
+        r.inner[U256_MAX_LIMBS - 1] = UInt32::zero(cs);
+
+        for i in 0..U256_MAX_LIMBS + 1 {
+            // \alpha_{i+l-1} is the (k-l-i)-th limb of n
+            let alpha = self.inner[U256_MAX_LIMBS - i];
+            let alpha = convert_limb_to_u512(cs, &alpha);
+
+            // d_i <- b*r_{i-1} + \alpha_{i+l-1}
+            // d_i can safely be UInt512 in size.
+            // r can have any number of limbs up to 8.
+            // base is 2 limbs wide since b=(2^{32}-1)+1
+            // TODO: Mul by base might be optimized
+            let d = base.widening_mul(cs, &r, 2, 8);
+            let (d_plus_alpha, overflow) = d.overflowing_add(cs, &alpha);
+            // d_i cannot overflow UInt512
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+            let d = d_plus_alpha;
+
+            // beta_i <- next digit of the quotient. We use binary search to
+            // find a suitable beta_i
+            let mut beta = UInt256::zero(cs);
+            let mut left = UInt256::zero(cs);
+            let mut right = base;
+
+            // Preparing new_r to further update r
+            let mut new_r = UInt512::zero(cs);
+
+            for _ in 0..MAX_BINARY_SEARCH_ITERATIONS {
+                // beta <- ceil((right + left) / 2)
+                let (new_beta, overflow) = right.overflowing_add(cs, &left);
+                // Cannot overflow since right and left are at most b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+
+                // Since new_beta.div2 gives floor, we need to add 1 if new_beta is odd to get ceil
+                let odd = new_beta.is_odd(cs);
+                let beta_div_2 = new_beta.div2(cs);
+                let (beta_div_2_plus_1, overflow) = beta_div_2.overflowing_add(cs, &one);
+                // Cannot overflow since beta_div_2 + one is at most b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+                beta = UInt256::conditionally_select(cs, odd, &beta_div_2_plus_1, &beta_div_2);
+
+                // r <- d - m * beta
+                // beta can fit in 2 limbs since it is less than or equal to b=2^{32}
+                let m_beta = other.widening_mul(cs, &beta, 8, 2);
+                let (r, r_negative) = d.overflowing_sub(cs, &m_beta);
+
+                // if r < 0 (that is, overflow occurred), then right <- beta - 1
+                // beta - 1 might overflow at step 33, but we don't care about it
+                let (beta_minus_1, _) = beta.overflowing_sub(cs, &one);
+                right = UInt256::conditionally_select(cs, r_negative, &beta_minus_1, &right);
+
+                // if r >= m, then left <- beta + 1
+                let r_geq_m = r.geq_than_u256(cs, other);
+                // We should handle the case when r overflowed
+                let r_positive = r_negative.negated(cs);
+                let r_greater_m = r_geq_m.and(cs, r_positive);
+                let (beta_plus_1, overflow) = beta.overflowing_add(cs, &one);
+                // Cannot overflow since beta < b=2^{32}
+                Boolean::enforce_equal(cs, &overflow, &boolean_false);
+                left = UInt256::conditionally_select(cs, r_greater_m, &beta_plus_1, &left);
+
+                // Updating r
+                new_r = r
+            }
+
+            // Assert that new_r indeed fits in UInt256
+            let boolean_true = Boolean::allocated_constant(cs, true);
+            for limb in new_r.inner[8..].iter() {
+                let limb_is_zero = limb.is_zero(cs);
+                Boolean::enforce_equal(cs, &limb_is_zero, &boolean_true);
+            }
+            // Update r
+            r = new_r.to_low();
+
+            // Assert that r < m
+            let (_, overflow) = other.overflowing_sub(cs, &r);
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+
+            // q_i <- b*q_{i-1} + beta_i
+            let beta_u512 = beta.to_u512(cs);
+            q = q.must_mul_by_2_pow_32(cs);
+            let (new_q, overflow) = q.overflowing_add(cs, &beta_u512);
+            // Cannot overflow since the quotient cannot exceed 2^{512}
+            Boolean::enforce_equal(cs, &overflow, &boolean_false);
+            q = new_q;
+        }
+
+        (q, r)
+    }
+
     // Returns the value unchanges if `bit` is `true`, and 0 otherwise
     #[must_use]
     pub fn mask<CS: ConstraintSystem<F>>(&self, cs: &mut CS, masking_bit: Boolean<F>) -> Self {
@@ -254,6 +411,11 @@ impl<F: SmallField> UInt512<F> {
         Self { inner }
     }
 
+    #[must_use]
+    pub fn from_limbs(limbs: [UInt32<F>; 16]) -> Self {
+        Self { inner: limbs }
+    }
+
     #[must_use]
     pub fn from_be_bytes<CS: ConstraintSystem<F>>(cs: &mut CS, bytes: [UInt8<F>; 64]) -> Self {
         let mut inner = [std::mem::MaybeUninit::uninit(); 16];
diff --git a/src/implementations/poseidon2/mod.rs b/src/implementations/poseidon2/mod.rs
index ecb1326..6dbb7e0 100644
--- a/src/implementations/poseidon2/mod.rs
+++ b/src/implementations/poseidon2/mod.rs
@@ -4,18 +4,25 @@ use crate::field::goldilocks::GoldilocksField;
 pub mod params;
 pub mod state_generic_impl;
 
-#[cfg(not(any(
-    target_feature = "neon",
-    target_feature = "avx2",
-    target_feature = "avx512bw",
-    target_feature = "avx512cd",
-    target_feature = "avx512dq",
-    target_feature = "avx512f",
-    target_feature = "avx512vl"
+#[cfg(not(all(
+    feature = "include_packed_simd",
+    any(
+        target_feature = "neon",
+        target_feature = "avx2",
+        target_feature = "avx512bw",
+        target_feature = "avx512cd",
+        target_feature = "avx512dq",
+        target_feature = "avx512f",
+        target_feature = "avx512vl",
+    )
 )))]
 pub use state_generic_impl::*;
 
+// Other poseidon implementations depend on packed_simd 128,
+// which is no longer available in std::simd (and packed_simd is no longer
+// supported on the newest rust nightly).
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -28,6 +35,7 @@ pub use state_generic_impl::*;
 pub mod state_vectorized_double;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     any(target_feature = "neon", target_feature = "avx2"),
     not(any(
         target_feature = "avx512bw",
@@ -40,6 +48,7 @@ pub mod state_vectorized_double;
 pub use state_vectorized_double::*;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
@@ -49,6 +58,7 @@ pub use state_vectorized_double::*;
 pub mod state_avx512;
 
 #[cfg(all(
+    feature = "include_packed_simd",
     target_feature = "avx512bw",
     target_feature = "avx512cd",
     target_feature = "avx512dq",
diff --git a/src/implementations/poseidon2/state_generic_impl.rs b/src/implementations/poseidon2/state_generic_impl.rs
index 02cb079..c9b74e8 100644
--- a/src/implementations/poseidon2/state_generic_impl.rs
+++ b/src/implementations/poseidon2/state_generic_impl.rs
@@ -29,7 +29,9 @@ impl State {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
+    #[cfg(feature = "include_packed_simd")]
     pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
 
     pub const RATE: usize = poseidon_goldilocks_params::RATE;
diff --git a/src/lib.rs b/src/lib.rs
index 72775d5..4f2e1ee 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,9 @@
 #![allow(dead_code)]
 #![allow(dropping_references)] // Required to explicitly show that mutable references are dropped.
 #![allow(incomplete_features)]
+#![allow(internal_features)] // Required for core_intrinsics
+#![allow(stable_features)]
+#![allow(unused_unsafe)]
 // Enabled features
 #![feature(allocator_api)]
 #![feature(const_mut_refs)]
@@ -43,7 +46,6 @@
 #![feature(generic_const_exprs)]
 #![feature(iter_array_chunks)]
 // #![recursion_limit = "1024"]
-#![feature(stdsimd)]
 #![feature(avx512_target_feature)]
 #![feature(associated_type_defaults)]
 #![feature(trait_alias)]
@@ -51,6 +53,7 @@
 #![feature(return_position_impl_trait_in_trait)]
 #![feature(type_changing_struct_update)]
 #![feature(slice_flatten)]
+#![cfg_attr(feature = "include_packed_simd", feature(stdsimd))]
 
 pub mod algebraic_props;
 pub mod config;
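
With this gating, the packed_simd code paths are compiled only when the `include_packed_simd` feature is enabled together with a matching target feature; everything else falls back to the generic implementation. An illustrative invocation (hypothetical, assuming a nightly toolchain on which `packed_simd` still builds):

    RUSTFLAGS="-C target-feature=+avx2" cargo build --features include_packed_simd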