diff --git a/constraint-evaluation-generator/src/constraints.rs b/constraint-evaluation-generator/src/constraints.rs
index 376a1852..803d5f93 100644
--- a/constraint-evaluation-generator/src/constraints.rs
+++ b/constraint-evaluation-generator/src/constraints.rs
@@ -108,13 +108,6 @@ impl Constraints {
         .concat()
     }
 
-    pub fn fold_constants(&mut self) {
-        ConstraintCircuitMonad::constant_folding(&mut self.init);
-        ConstraintCircuitMonad::constant_folding(&mut self.cons);
-        ConstraintCircuitMonad::constant_folding(&mut self.tran);
-        ConstraintCircuitMonad::constant_folding(&mut self.term);
-    }
-
     pub fn lower_to_target_degree_through_substitutions(&mut self) -> AllSubstitutions {
         // Subtract the degree lowering table's width from the total number of columns to guarantee
         // the same number of columns even for repeated runs of the constraint evaluation generator.
diff --git a/constraint-evaluation-generator/src/main.rs b/constraint-evaluation-generator/src/main.rs
index a4ce42cf..317ec904 100644
--- a/constraint-evaluation-generator/src/main.rs
+++ b/constraint-evaluation-generator/src/main.rs
@@ -13,7 +13,6 @@ mod substitution;
 
 fn main() {
     let mut constraints = Constraints::all();
-    constraints.fold_constants();
     let substitutions = constraints.lower_to_target_degree_through_substitutions();
     let degree_lowering_table_code = substitutions.generate_degree_lowering_table_code();
 
@@ -45,7 +44,6 @@ mod tests {
     #[test]
     fn degree_lowering_tables_code_can_be_generated_for_test_constraints() {
         let mut constraints = Constraints::test_constraints();
-        constraints.fold_constants();
         let substitutions = constraints.lower_to_target_degree_through_substitutions();
         let _ = substitutions.generate_degree_lowering_table_code();
     }
@@ -58,7 +56,6 @@ mod tests {
     #[test]
     fn degree_lowering_tables_code_can_be_generated_from_all_constraints() {
         let mut constraints = Constraints::all();
-        constraints.fold_constants();
         let substitutions = constraints.lower_to_target_degree_through_substitutions();
         let _ = substitutions.generate_degree_lowering_table_code();
     }
@@ -66,7 +63,6 @@ mod tests {
     #[test]
     fn constraints_and_substitutions_can_be_combined() {
         let mut constraints = Constraints::test_constraints();
-        constraints.fold_constants();
         let substitutions = constraints.lower_to_target_degree_through_substitutions();
         let _ = constraints.combine_with_substitution_induced_constraints(substitutions);
     }
diff --git a/specification/src/img/instruction_groups_small.png b/specification/src/img/instruction_groups_small.png
deleted file mode 100644
index 7225e034..00000000
Binary files a/specification/src/img/instruction_groups_small.png and /dev/null differ
diff --git a/specification/src/instruction-groups.md b/specification/src/instruction-groups.md
index f21a2a7c..17b9d5ee 100644
--- a/specification/src/instruction-groups.md
+++ b/specification/src/instruction-groups.md
@@ -12,67 +12,71 @@ The following table lists and briefly explains all instruction groups.
 | group name                   | description                                                                                                                                              |
 |:-----------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `decompose_arg`              | instruction's argument held in `nia` is binary decomposed into helper registers `hv0` through `hv3`                                                      |
-| `prohibit_illegal_num_words` | constrain the instruction's argument `n` to 1 ⩽ `n` ⩽ 5                                                                                                  |
+| `prohibit_illegal_num_words` | constrain the instruction's argument `n` to 1 ⩽ `n` ⩽ 5                                                                                                 |
 | `no_io`                      | the running evaluations for public input & output remain unchanged                                                                                       |
-| `keep_ram`                   | RAM does not change, _i.e._, the running product of the [Permutation Argument](permutation-argument.md) with the RAM Table remains unchanged             |
+| `no_ram`                     | RAM is not being accessed, _i.e._, the running product of the [Permutation Argument](permutation-argument.md) with the RAM Table remains unchanged       |
 | `keep_jump_stack`            | jump stack does not change, _i.e._, registers `jsp`, `jso`, and `jsd` do not change                                                                      |
 | `step_1`                     | jump stack does not change and instruction pointer `ip` increases by 1                                                                                   |
 | `step_2`                     | jump stack does not change and instruction pointer `ip` increases by 2                                                                                   |
 | `grow_op_stack`              | op stack elements are shifted down by one position, top element of the resulting stack is unconstrained                                                  |
 | `grow_op_stack_by_any_of`    | op stack elements are shifted down by `n` positions, top `n` elements of the resulting stack are unconstrained, where `n` is the instruction's argument  |
-| `unary_operation`            | op stack's top-most element is unconstrained, rest of stack remains unchanged                                                                            |
-| `keep_op_stack`              | op stack remains unchanged, _i.e._, the running product of the [Permutation Argument](permutation-argument.md) with the Op Stack Table remains unchanged |
+| `keep_op_stack_height`       | the op stack height remains unchanged, and so in particular the running product of the [Permutation Argument](permutation-argument.md) with the Op Stack table remains unchanged |
+| `op_stack_remains_except_top_n_elements_unconstrained` | all but the top `n` elements of the op stack remain unchanged                                                |
+| `keep_op_stack`              | op stack remains entirely unchanged                                                                                                                      |
 | `binary_operation`           | op stack elements starting from `st2` are shifted up by one position, highest two elements of the resulting stack are unconstrained                      |
 | `shrink_op_stack`            | op stack elements starting from `st1` are shifted up by one position                                                                                     |
 | `shrink_op_stack_by_any_of`  | op stack elements starting from `stn` are shifted up by one position, where `n` is the instruction's argument                                            |
 
-Below figure gives a comprehensive overview over the subset relation between all instruction groups.
+The following instruction groups conceptually fit the category of 'instruction groups' but are not used or enforced through AIR constraints.
 
-<!-- Created using https://flowchart.fun, not recommended anymore since they changed their subscription model which is not worth our limited use -->
-![](img/instruction_groups_small.png)
+| group name | description                                                                                                                                                                |
+|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `no_hash`  | The hash coprocessor is not accessed. The constraints are implied by the evaluation argument with the Hash Table which takes the current instruction into account.         |
 
 A summary of all instructions and which groups they are part of is given in the following table.
 
-| instruction       | `decompose_arg` | `prohibit_illegal_num_words` | `no_io` | `keep_ram` | `keep_jump_stack` | `step_1` | `step_2` | `grow_op_stack` | `grow_op_stack_by_any_of` | `unary_operation` | `keep_op_stack` | `binary_operation` | `shrink_op_stack` | `shrink_op_stack_by_any_of` |
-|:------------------|:---------------:|:----------------------------:|:-------:|:----------:|:-----------------:|:--------:|:--------:|:---------------:|:-------------------------:|:-----------------:|:---------------:|:------------------:|:-----------------:|:---------------------------:|
-| `pop` + `n`       |        x        |              x               |    x    |     x      |                   |          |    x     |                 |                           |                   |                 |                    |                   |              x              |
-| `push` + `a`      |                 |                              |    x    |     x      |                   |          |    x     |        x        |                           |                   |                 |                    |                   |                             |
-| `divine`  + `n`   |        x        |              x               |    x    |     x      |                   |          |    x     |                 |             x             |                   |                 |                    |                   |                             |
-| `dup`  + `i`      |        x        |                              |    x    |     x      |                   |          |    x     |        x        |                           |                   |                 |                    |                   |                             |
-| `swap` + `i`      |        x        |                              |    x    |     x      |                   |          |    x     |                 |                           |                   |                 |                    |                   |                             |
-| `nop`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |        x        |                    |                   |                             |
-| `skiz`            |                 |                              |    x    |     x      |         x         |          |          |                 |                           |                   |                 |                    |         x         |                             |
-| `call` + `d`      |                 |                              |    x    |     x      |                   |          |          |                 |                           |                   |        x        |                    |                   |                             |
-| `return`          |                 |                              |    x    |     x      |                   |          |          |                 |                           |                   |        x        |                    |                   |                             |
-| `recurse`         |                 |                              |    x    |     x      |         x         |          |          |                 |                           |                   |        x        |                    |                   |                             |
-| `assert`          |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |         x         |                             |
-| `halt`            |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |        x        |                    |                   |                             |
-| `read_mem` + `n`  |        x        |              x               |    x    |            |                   |          |    x     |                 |                           |                   |                 |                    |                   |                             |
-| `write_mem` + `n` |        x        |              x               |    x    |            |                   |          |    x     |                 |                           |                   |                 |                    |                   |                             |
-| `hash`            |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `divine_sibling`  |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `assert_vector`   |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `sponge_init`     |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `sponge_absorb`   |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `sponge_squeeze`  |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `add`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `mul`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `invert`          |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |         x         |                 |                    |                   |                             |
-| `eq`              |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `split`           |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `lt`              |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `and`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `xor`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `log_2_floor`     |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |         x         |                 |                    |                   |                             |
-| `pow`             |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |         x          |                   |                             |
-| `div_mod`         |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `pop_count`       |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |         x         |                 |                    |                   |                             |
-| `xxadd`           |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `xxmul`           |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `xinvert`         |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `xbmul`           |                 |                              |    x    |     x      |                   |    x     |          |                 |                           |                   |                 |                    |                   |                             |
-| `read_io` + `n`   |        x        |              x               |         |     x      |                   |          |    x     |                 |             x             |                   |                 |                    |                   |                             |
-| `write_io` + `n`  |        x        |              x               |         |     x      |                   |          |    x     |                 |                           |                   |                 |                    |                   |              x              |
+| instruction       | `decompose_arg` | `prohibit_illegal_num_words` | `no_io` | `no_ram` | `keep_jump_stack` | `step_1` | `step_2` | `grow_op_stack` | `grow_op_stack_by_any_of` | `keep_op_stack_height` | `op_stack_remains_except_top_n_elements_unconstrained` | `keep_op_stack` | `binary_operation` | `shrink_op_stack` | `shrink_op_stack_by_any_of` |
+|:------------------|:---------------:|:----------------------------:|:-------:|:--------:|:-----------------:|:--------:|:--------:|:---------------:|:-------------------------:|:----------------------:|:------------------------------------------------------:|:---------------:|:------------------:|:-----------------:|:---------------------------:|
+| `pop` + `n`       |        x        |              x               |    x    |    x     |                   |          |    x     |                 |                           |                        |                                                        |                 |                    |                   |              x              |
+| `push` + `a`      |                 |                              |    x    |    x     |                   |          |    x     |        x        |                           |                        |                                                        |                 |                    |                   |                             |
+| `divine`  + `n`   |        x        |              x               |    x    |    x     |                   |          |    x     |                 |             x             |                        |                                                        |                 |                    |                   |                             |
+| `dup`  + `i`      |        x        |                              |    x    |    x     |                   |          |    x     |        x        |                           |                        |                                                        |                 |                    |                   |                             |
+| `swap` + `i`      |        x        |                              |    x    |    x     |                   |          |    x     |                 |                           |           x            |                                                        |                 |                    |                   |                             |
+| `nop`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                            0                           |        x        |                    |                   |                             |
+| `skiz`            |                 |                              |    x    |    x     |         x         |          |          |                 |                           |                        |                                                        |                 |                    |         x         |                             |
+| `call` + `d`      |                 |                              |    x    |    x     |                   |          |          |                 |                           |           x            |                            0                           |        x        |                    |                   |                             |
+| `return`          |                 |                              |    x    |    x     |                   |          |          |                 |                           |           x            |                            0                           |        x        |                    |                   |                             |
+| `recurse`         |                 |                              |    x    |    x     |         x         |          |          |                 |                           |           x            |                            0                           |        x        |                    |                   |                             |
+| `assert`          |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |         x         |                             |
+| `halt`            |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                            0                           |        x        |                    |                   |                             |
+| `read_mem` + `n`  |        x        |              x               |    x    |          |                   |          |    x     |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `write_mem` + `n` |        x        |              x               |    x    |          |                   |          |    x     |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `hash`            |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `divine_sibling`  |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `assert_vector`   |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `sponge_init`     |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `sponge_absorb`   |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `sponge_squeeze`  |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `add`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `mul`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `invert`          |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                           1                            |                 |                    |                   |                             |
+| `eq`              |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `split`           |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `lt`              |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `and`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `xor`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `log_2_floor`     |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                           1                            |                 |                    |                   |                             |
+| `pow`             |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |         x          |                   |                             |
+| `div_mod`         |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                           2                            |                 |                    |                   |                             |
+| `pop_count`       |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                           1                            |                 |                    |                   |                             |
+| `xxadd`           |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `xxmul`           |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `xinvert`         |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |           x            |                           3                            |                 |                    |                   |                             |
+| `xbmul`           |                 |                              |    x    |    x     |                   |    x     |          |                 |                           |                        |                                                        |                 |                    |                   |                             |
+| `read_io` + `n`   |        x        |              x               |         |    x     |                   |          |    x     |                 |             x             |                        |                                                        |                 |                    |                   |                             |
+| `write_io` + `n`  |        x        |              x               |         |    x     |                   |          |    x     |                 |                           |                        |                                                        |                 |                    |                   |              x              |
+| `xxdotstep`       |                 |                              |    x    |          |                   |          |          |                 |                           |          x             |                           5                            |                 |                    |                   |                             |
+| `xbdotstep`       |                 |                              |    x    |          |                   |          |          |                 |                           |          x             |                           5                            |                 |                    |                   |              x              |
 
 ## Indicator Polynomials `ind_i(hv3, hv2, hv1, hv0)`
 
@@ -174,7 +178,7 @@ Therefore, the instruction argument's correct binary decomposition is known to b
 1. `RunningEvaluationStandardInput' - RunningEvaluationStandardInput`
 2. `RunningEvaluationStandardOutput' - RunningEvaluationStandardOutput`
 
-## Group `keep_ram`
+## Group `no_ram`
 
 ### Description
 
@@ -347,59 +351,38 @@ Therefore, the instruction's argument `n` correct binary decomposition is known
     else if `n` is 2, then the running product with the Op Stack Table accumulates `st14` and `st15`.
 1. If `n` is 1, then the running product with the Op Stack Table accumulates `st15`.
 
-## Group `unary_operation`
+## Group `keep_op_stack_height`
+
+The op stack pointer and the running product for the [Permutation Argument](permutation-argument.md) with the [Op Stack Table](operational-stack-table.md) remain the same. In other words, there is no access (neither read nor write) from/to the Op Stack Table.
 
 ### Description
 
-1. The stack element in `st1` does not change.
-1. The stack element in `st2` does not change.
-1. The stack element in `st3` does not change.
-1. The stack element in `st4` does not change.
-1. The stack element in `st5` does not change.
-1. The stack element in `st6` does not change.
-1. The stack element in `st7` does not change.
-1. The stack element in `st8` does not change.
-1. The stack element in `st9` does not change.
-1. The stack element in `st10` does not change.
-1. The stack element in `st11` does not change.
-1. The stack element in `st12` does not change.
-1. The stack element in `st13` does not change.
-1. The stack element in `st14` does not change.
-1. The stack element in `st15` does not change.
 1. The op stack pointer does not change.
 1. The running product for the Op Stack Table remains unchanged.
 
 ### Polynomials
 
-1. `st1' - st1`
-1. `st2' - st2`
-1. `st3' - st3`
-1. `st4' - st4`
-1. `st5' - st5`
-1. `st6' - st6`
-1. `st7' - st7`
-1. `st8' - st8`
-1. `st9' - st9`
-1. `st10' - st10`
-1. `st11' - st11`
-1. `st12' - st12`
-1. `st13' - st13`
-1. `st14' - st14`
-1. `st15' - st15`
 1. `op_stack_pointer' - op_stack_pointer`
 1. `RunningProductOpStackTable' - RunningProductOpStackTable`
 
-## Group `keep_op_stack`
 
-Contains all constraints from instruction group `unary_operation`, and additionally:
+## Group `op_stack_remains_except_top_n_elements_unconstrained`
+
+Contains all constraints from group `keep_op_stack_height`, and additionally ensures that all but the top `n` op stack elements remain the same. (The top `n` elements are unconstrained.)
 
 ### Description
 
-1. The stack element in `st0` does not change.
+1. For `i` in `{n, ..., NUM_OP_STACK_REGISTERS-1}` The stack element in `sti` does not change.
 
 ### Polynomials
 
-1. `st0' - st0`
+1. `stn' - stn`
+1. ...
+1. `stN' - stN`
+
+## Group `keep_op_stack`
+
+Short-hand for `op_stack_remains_except_top_n_elements_unconstrained` with `n=0`.
 
 ## Group `binary_operation`
 
diff --git a/specification/src/instruction-specific-transition-constraints.md b/specification/src/instruction-specific-transition-constraints.md
index 1f01d270..6a28be1d 100644
--- a/specification/src/instruction-specific-transition-constraints.md
+++ b/specification/src/instruction-specific-transition-constraints.md
@@ -796,7 +796,6 @@ In addition to its [instruction groups](instruction-groups.md), this instruction
 ### Description
 
 1. Numerator is quotient times denominator plus remainder: `n == q·d + r`.
-1. Stack element `st2` does not change.
 
 ### Polynomials
 
@@ -991,3 +990,25 @@ In addition to its [instruction groups](instruction-groups.md), this instruction
     `+ ind_3(hv3, hv2, hv1, hv0)·(RunningEvaluationStandardOutput' - 🧯·(🧯·(🧯·RunningEvaluationStandardOutput - st0) - st1) - st2)`<br />
     `+ ind_4(hv3, hv2, hv1, hv0)·(RunningEvaluationStandardOutput' - 🧯·(🧯·(🧯·(🧯·RunningEvaluationStandardOutput - st0) - st1) - st2) - st3)`<br />
     `+ ind_5(hv3, hv2, hv1, hv0)·(RunningEvaluationStandardOutput' - 🧯·(🧯·(🧯·(🧯·(🧯·RunningEvaluationStandardOutput - st0) - st1) - st2) - st3) - st4)`
+
+## Instruction `xxdotstep`
+
+In addition to its [instruction groups](instruction-groups.md), this instruction has the following constraints.
+
+### Description
+
+1. Store `(RAM[st0], RAM[st0+1], RAM[st0+2])` in `(hv0, hv1, hv2)`.
+1. Store `(RAM[st1], RAM[st1+1], RAM[st1+2])` in `(hv3, hv4, hv5)`.
+1. Add `(hv0 + hv1·x + hv2·x²) · (hv3 + hv4·x + hv5·x²)` into `(st2, st3, st4)`
+1. Increase the pointers: `st0` and `st1` by 3 each.
+
+## Instruction `xbdotstep`
+
+In addition to its [instruction groups](instruction-groups.md), this instruction has the following constraints.
+
+### Description
+
+1. Store `RAM[st0]` in `hv0`.
+1. Store `(RAM[st1], RAM[st1+1], RAM[st1+2])` in `(hv1, hv2, hv3)`.
+1. Add `hv0 · (hv1 + hv2·x + hv3·x²)` into `(st1, st2, st3)`
+1. Increase the pointers: `st0` and `st1` by 1 and 3, respectively.
\ No newline at end of file
diff --git a/specification/src/instructions.md b/specification/src/instructions.md
index c802cde9..ca1b93e7 100644
--- a/specification/src/instructions.md
+++ b/specification/src/instructions.md
@@ -146,3 +146,10 @@ Triton VM cannot know the number of elements that will be absorbed.
 |:-----------------|-------:|:----------------|:----------------|:-----------------------------------------------------------------------------------------|
 | `read_io` + `n`  |     49 | e.g., `_`       | e.g., `_ c b a` | Reads `n` B-Field elements from standard input and pushes them to the stack. 1 ⩽ `n` ⩽ 5 |
 | `write_io` + `n` |     19 | e.g., `_ c b a` | e.g., `_`       | Pops `n` elements from the stack and writes them to standard output. 1 ⩽ `n` ⩽ 5         |
+
+## Many-In-One
+
+| Instruction | Opcode | old op stack    | new op stack                 | Description |
+|:------------|--------|:----------------|:-----------------------------|-------------|
+| `xxdotstep` | 72     | `_ z y x *b *a` | `_ z+p2 y+p1 x+p0 *b+3 *a+3` | Reads two extension field elements from RAM located at the addresses corresponding to the two top stack elements, multiplies the extension field elements, and adds the product `(p0, p1, p2)` to an accumulator located on stack immediately below the two pointers. Also, increase the pointers by the number of words read. |
+| `xxdotstep` | 80     | `_ z y x *b *a` | `_ z+p2 y+p1 x+p0 *b+3 *a+1` | Reads one base field element from RAM located at the addresses corresponding to the top of the stack, one extension field element from RAM located at the address of the second stack element, multiplies the field elements, and adds the product `(p0, p1, p2)` to an accumulator located on stack immediately below the two pointers. Also, increase the pointers by the number of words read. |
\ No newline at end of file
diff --git a/specification/src/isa.md b/specification/src/isa.md
index 3c42de02..baccb331 100644
--- a/specification/src/isa.md
+++ b/specification/src/isa.md
@@ -5,6 +5,8 @@ It is a [Harvard architecture](https://en.wikipedia.org/wiki/Harvard_architectur
 The arithmetization of the VM is defined over the *B-field* $\mathbb{F}_p$ where $p$ is the Oxfoi prime, _i.e._, $2^{64}-2^{32}+1$.[^oxfoi]
 This means the registers and memory elements take values from $\mathbb{F}_p$, and the transition function gives rise to low-degree transition verification polynomials from the ring of multivariate polynomials over $\mathbb{F}_p$.
 
+At certain points in the course of arithmetization we need an extension field over $\mathbb{F}_p$ to ensure sufficient soundness. To this end we use $\mathbb{F}_{p^3} = \frac{\mathbb{F}_p[X]}{\langle X^3 -X +1 \rangle}$, _i.e._, the quotient ring of remainders of polynomials after division by the *Shah* polynomial, $X^3 -X +1$. We refer to this field as the *X-field* for short, and its elements as *X-field elements*.
+
 Instructions have variable width:
 they either consist of one word, i.e., one B-field element, or of two words, i.e., two B-field elements.
 An example for a single-word instruction is `add`, adding the two elements on top of the stack, leaving the result as the new top of the stack.
diff --git a/specification/src/processor-table.md b/specification/src/processor-table.md
index 0ee6839e..d5f9eab1 100644
--- a/specification/src/processor-table.md
+++ b/specification/src/processor-table.md
@@ -26,7 +26,7 @@ The Processor Table has the following extension columns, corresponding to [Evalu
 1. `RunningEvaluationStandardOutput` for the Evaluation Argument with the output symbols.
 1. `InstructionLookupClientLogDerivative` for the Lookup Argument with the [Program Table](program-table.md)
 1. `RunningProductOpStackTable` for the Permutation Argument with the [Op Stack Table](operational-stack-table.md).
-1. `RunningProductRamTable` for the Permutation Argument with the [RAM Table](random-access-memory-table.md).
+1. `RunningProductRamTable` for the Permutation Argument with the [RAM Table](random-access-memory-table.md). Note that virtual column `instruction_type` holds value 1 for reads and 0 for writes.
 1. `RunningProductJumpStackTable` for the Permutation Argument with the [Jump Stack Table](jump-stack-table.md).
 1. `RunningEvaluationHashInput` for the Evaluation Argument with the [Hash Table](hash-table.md) for copying the input to the hash function from the processor to the hash coprocessor.
 1. `RunningEvaluationHashDigest` for the Evaluation Argument with the [Hash Table](hash-table.md) for copying the hash digest from the hash coprocessor to the processor.
@@ -112,7 +112,7 @@ See [program attestation](program-attestation.md) for more details.
 1. `RunningEvaluationHashDigest` is 1.
 1. `RunningEvaluationSponge` is 1.
 1. `U32LookupClientLogDerivative` is 0.
-1. `ClockJumpDifferenceLookupServerLogDerivative` is 0.
+1. `ClockJumpDifferenceLookupServerLogDerivative` starts having accumulated the first contribution.
 
 ### Initial Constraints as Polynomials
 
@@ -138,14 +138,14 @@ See [program attestation](program-attestation.md) for more details.
 1. `RunningEvaluationStandardOutput - 1`
 1. `InstructionLookupClientLogDerivative · (🪥 - 🥝·ip - 🥥·ci - 🫐·nia) - 1`
 1. `RunningProductOpStackTable - 1`
-1. `RunningProductRamTable - (🛋 - 🍍·clk - 🍈·ramp - 🍎·ramv - 🌽·previous_instruction)`
+1. `RunningProductRamTable - (🛋 - 🍍·clk - 🍈·ramp - 🍎·ramv - 🌽·instruction_type)`
 1. `RunningProductJumpStackTable - (🧴 - 🍇·clk - 🍅·ci - 🍌·jsp - 🍏·jso - 🍐·jsd)`
 1. `(ci - opcode(hash))·(RunningEvaluationHashInput - 1)`<br />
     `+ hash_deselector·(RunningEvaluationHashInput - 🚪 - 🧄₀·st0 - 🧄₁·st1 - 🧄₂·st2 - 🧄₃·st3 - 🧄₄·st4 - 🧄₅·st5 - 🧄₆·st6 - 🧄₇·st7 - 🧄₈·st8 - 🧄₉·st9)`
 1. `RunningEvaluationHashDigest - 1`
 1. `RunningEvaluationSponge - 1`
 1. `U32LookupClientLogDerivative`
-1. `ClockJumpDifferenceLookupServerLogDerivative`
+1. `ClockJumpDifferenceLookupServerLogDerivative · 🪞 - cjd_mul` (Recall that `(🪞 - clk) = 🪞` because `clk = 0`.)
 
 ## Consistency Constraints
 
diff --git a/specification/src/random-access-memory-table.md b/specification/src/random-access-memory-table.md
index f26b8da9..7195968d 100644
--- a/specification/src/random-access-memory-table.md
+++ b/specification/src/random-access-memory-table.md
@@ -166,7 +166,7 @@ Both types of challenges are X-field elements, _i.e._, elements of $\mathbb{F}_{
 1. `bc1 - bcpc1`
 1. `RunningProductOfRAMP - 🧼 + ram_pointer`
 1. `FormalDerivative - 1`
-1. `(RunningProductPermArg - 🛋 - 🍍·clk - 🍈·ram_pointer - 🍎·ram_value - 🌽·previous_instruction)·(instruction_type - 2)`<br />
+1. `(RunningProductPermArg - 🛋 - 🍍·clk - 🍈·ram_pointer - 🍎·ram_value - 🌽·instruction_type)·(instruction_type - 2)`<br />
     `(RunningProductPermArg - 1)·(instruction_type - 1)·(instruction_type - 0)`
 1. `ClockJumpDifferenceLookupClientLogDerivative`
 
@@ -211,7 +211,7 @@ None.
      `+ (ram_pointer' - ram_pointer)·(bc0' - bc0·🧼 - bcpc0')`
 1. `(iord·(ram_pointer' - ram_pointer) - 1)·(bc1' - bc1)`<br />
      `+ (ram_pointer' - ram_pointer)·(bc1' - bc1·🧼 - bcpc1')`
-1. `(RunningProductPermArg' - RunningProductPermArg·(🛋 - 🍍·clk' - 🍈·ram_pointer' - 🍎·ram_value' - 🌽·previous_instruction'))·(instruction_type' - 2)`<br />
+1. `(RunningProductPermArg' - RunningProductPermArg·(🛋 - 🍍·clk' - 🍈·ram_pointer' - 🍎·ram_value' - 🌽·instruction_type'))·(instruction_type' - 2)`<br />
     `(RunningProductPermArg' - RunningProductPermArg)·(instruction_type - 1)·(instruction_type - 0))`
 1. `(iord·(ram_pointer' - ram_pointer) - 1)·(instruction_type' - 2)·((ClockJumpDifferenceLookupClientLogDerivative' - ClockJumpDifferenceLookupClientLogDerivative) · (🪞 - clk' + clk) - 1)`<br />
     `+ (ram_pointer' - ram_pointer)·(ClockJumpDifferenceLookupClientLogDerivative' - ClockJumpDifferenceLookupClientLogDerivative)`<br />
diff --git a/specification/src/registers.md b/specification/src/registers.md
index a878af7e..08367adf 100644
--- a/specification/src/registers.md
+++ b/specification/src/registers.md
@@ -55,6 +55,8 @@ Because they are only needed for some instructions, the helper variables are not
 For instruction group [`decompose_arg`](instruction-groups.md#group-decompose_arg) and instructions
 [`skiz`](instruction-specific-transition-constraints.md#helper-variable-definitions-for-skiz),
 [`divine_sibling`](instruction-specific-transition-constraints.md#helper-variable-definitions-for-divine_sibling),
-[`split`](instruction-specific-transition-constraints.md#helper-variable-definitions-for-split), and
+[`split`](instruction-specific-transition-constraints.md#helper-variable-definitions-for-split),
 [`eq`](instruction-specific-transition-constraints.md#helper-variable-definitions-for-eq),
+[`xxdotstep`](instruction-specific-transition-constraints.md#instruction-xxdotstep), and
+[`xbdotstep`](instruction-specific-transition-constraints.md#instruction-xbdotstep),
 the behavior is defined in the respective sections.
diff --git a/triton-vm/src/instruction.rs b/triton-vm/src/instruction.rs
index 7874e919..44c7f919 100644
--- a/triton-vm/src/instruction.rs
+++ b/triton-vm/src/instruction.rs
@@ -209,6 +209,10 @@ pub enum AnInstruction<Dest: PartialEq + Default> {
     // Read/write
     ReadIo(NumberOfWords),
     WriteIo(NumberOfWords),
+
+    // Many-in-One
+    XxDotStep,
+    XbDotStep,
 }
 
 impl<Dest: PartialEq + Default> AnInstruction<Dest> {
@@ -253,6 +257,8 @@ impl<Dest: PartialEq + Default> AnInstruction<Dest> {
             XbMul => 82,
             ReadIo(_) => 49,
             WriteIo(_) => 19,
+            XxDotStep => 72,
+            XbDotStep => 80,
         }
     }
 
@@ -296,6 +302,8 @@ impl<Dest: PartialEq + Default> AnInstruction<Dest> {
             XbMul => "xbmul",
             ReadIo(_) => "read_io",
             WriteIo(_) => "write_io",
+            XxDotStep => "xxdotstep",
+            XbDotStep => "xbdotstep",
         }
     }
 
@@ -367,6 +375,8 @@ impl<Dest: PartialEq + Default> AnInstruction<Dest> {
             XbMul => XbMul,
             ReadIo(x) => ReadIo(*x),
             WriteIo(x) => WriteIo(*x),
+            XxDotStep => XxDotStep,
+            XbDotStep => XbDotStep,
         }
     }
 
@@ -410,6 +420,8 @@ impl<Dest: PartialEq + Default> AnInstruction<Dest> {
             XbMul => -1,
             ReadIo(n) => n.num_words() as i32,
             WriteIo(n) => -(n.num_words() as i32),
+            XxDotStep => 0,
+            XbDotStep => 0,
         }
     }
 
@@ -560,6 +572,8 @@ const fn all_instructions_without_args() -> [AnInstruction<BFieldElement>; Instr
         XbMul,
         ReadIo(N1),
         WriteIo(N1),
+        XxDotStep,
+        XbDotStep,
     ]
 }
 
diff --git a/triton-vm/src/parser.rs b/triton-vm/src/parser.rs
index 1915f27c..5054c29e 100644
--- a/triton-vm/src/parser.rs
+++ b/triton-vm/src/parser.rs
@@ -285,6 +285,12 @@ fn an_instruction(s: &str) -> ParseResult<AnInstruction<String>> {
 
     let read_write = alt((read_io, write_io));
 
+    // Many-in-One
+    let xxdotstep = instruction("xxdotstep", XxDotStep);
+    let xbdotstep = instruction("xbdotstep", XbDotStep);
+
+    let many_to_one = alt((xxdotstep, xbdotstep));
+
     // Because of common prefixes, the following parsers are sensitive to order:
     //
     // Successfully parsing "assert" before trying "assert_vector" can lead to
@@ -300,6 +306,7 @@ fn an_instruction(s: &str) -> ParseResult<AnInstruction<String>> {
         arithmetic_on_stack,
         read_write,
         syntax_ambiguous,
+        many_to_one,
     ))(s)
 }
 
diff --git a/triton-vm/src/stark.rs b/triton-vm/src/stark.rs
index 954a425a..35df41bb 100644
--- a/triton-vm/src/stark.rs
+++ b/triton-vm/src/stark.rs
@@ -2053,6 +2053,34 @@ pub(crate) mod tests {
         triton_constraints_evaluate_to_zero(property_based_test_program_for_random_ram_access())
     }
 
+    #[test]
+    fn constraints_evaluate_to_zero_on_property_based_test_program_for_xxdotstep() {
+        triton_constraints_evaluate_to_zero(property_based_test_program_for_xxdotstep());
+    }
+
+    #[test]
+    fn constraints_evaluate_to_zero_on_property_based_test_program_for_xbdotstep() {
+        triton_constraints_evaluate_to_zero(property_based_test_program_for_xbdotstep());
+    }
+
+    #[test]
+    fn can_read_twice_from_same_ram_address_within_one_cycle() {
+        for i in 0..=2 {
+            // This program reads from the same address twice, even if the stack
+            // is not well-initialized.
+            let program = triton_program! {
+                dup 0
+                push {i} add
+                xxdotstep
+                halt
+            };
+            let result = program.run(PublicInput::default(), NonDeterminism::default());
+            assert!(result.is_ok());
+            let program_and_input = ProgramAndInput::new(program);
+            triton_constraints_evaluate_to_zero(program_and_input);
+        }
+    }
+
     #[test]
     fn claim_in_ram_corresponds_to_currently_running_program() {
         triton_constraints_evaluate_to_zero(
diff --git a/triton-vm/src/table/constraint_circuit.rs b/triton-vm/src/table/constraint_circuit.rs
index bca7cbba..44154f14 100644
--- a/triton-vm/src/table/constraint_circuit.rs
+++ b/triton-vm/src/table/constraint_circuit.rs
@@ -560,6 +560,8 @@ impl<II: InputIndicator> Eq for ConstraintCircuitMonad<II> {}
 /// Helper function for binary operations that are used to generate new parent nodes in the
 /// multitree that represents the algebraic circuit. Ensures that each newly created node has a
 /// unique ID.
+///
+/// This function does not (currently) catch expressions of the form ((x+1)+1).
 fn binop<II: InputIndicator>(
     binop: BinOp,
     lhs: ConstraintCircuitMonad<II>,
@@ -567,6 +569,27 @@ fn binop<II: InputIndicator>(
 ) -> ConstraintCircuitMonad<II> {
     assert!(lhs.builder.is_same_as(&rhs.builder));
 
+    match (binop, &lhs, &rhs) {
+        (BinOp::Add, _, zero) if zero.circuit.borrow().is_zero() => return lhs,
+        (BinOp::Add, zero, _) if zero.circuit.borrow().is_zero() => return rhs,
+        (BinOp::Mul, _, one) if one.circuit.borrow().is_one() => return lhs,
+        (BinOp::Mul, one, _) if one.circuit.borrow().is_one() => return rhs,
+        (BinOp::Mul, _, zero) if zero.circuit.borrow().is_zero() => return rhs,
+        (BinOp::Mul, zero, _) if zero.circuit.borrow().is_zero() => return lhs,
+        _ => (),
+    };
+
+    match (
+        &lhs.circuit.borrow().expression,
+        &rhs.circuit.borrow().expression,
+    ) {
+        (&BConstant(l), &BConstant(r)) => return lhs.builder.b_constant(binop.operation(l, r)),
+        (&BConstant(l), &XConstant(r)) => return lhs.builder.x_constant(binop.operation(l, r)),
+        (&XConstant(l), &BConstant(r)) => return lhs.builder.x_constant(binop.operation(l, r)),
+        (&XConstant(l), &XConstant(r)) => return lhs.builder.x_constant(binop.operation(l, r)),
+        _ => (),
+    };
+
     // all `BinOp`s are commutative – try both orders of the operands
     let new_node = binop_new_node(binop, &rhs, &lhs);
     if let Some(node) = lhs.builder.all_nodes.borrow().get(&new_node) {
@@ -637,94 +660,98 @@ impl<II: InputIndicator> Sum for ConstraintCircuitMonad<II> {
 }
 
 impl<II: InputIndicator> ConstraintCircuitMonad<II> {
-    pub(crate) fn new_monad_same_context(
-        &self,
-        circuit: Rc<RefCell<ConstraintCircuit<II>>>,
-    ) -> Self {
-        Self {
-            circuit,
-            builder: self.builder.clone(),
-        }
-    }
-
     /// Unwrap a ConstraintCircuitMonad to reveal its inner ConstraintCircuit
     pub fn consume(&self) -> ConstraintCircuit<II> {
         self.circuit.borrow().to_owned()
     }
 
-    fn find_equivalent_expression(&self) -> Option<Rc<RefCell<ConstraintCircuit<II>>>> {
-        let BinaryOperation(op, lhs, rhs) = &self.circuit.borrow().expression else {
-            return None;
-        };
-
-        match (op, lhs, rhs) {
-            (BinOp::Add, c, zero) if zero.borrow().is_zero() => return Some(c.clone()),
-            (BinOp::Add, zero, c) if zero.borrow().is_zero() => return Some(c.clone()),
-            (BinOp::Mul, c, one) if one.borrow().is_one() => return Some(c.clone()),
-            (BinOp::Mul, one, c) if one.borrow().is_one() => return Some(c.clone()),
-            (BinOp::Mul, _, zero) if zero.borrow().is_zero() => return Some(zero.clone()),
-            (BinOp::Mul, zero, _) if zero.borrow().is_zero() => return Some(zero.clone()),
-            _ => (),
-        };
-
-        let new_const = match (&lhs.borrow().expression, &rhs.borrow().expression) {
-            (&BConstant(l), &BConstant(r)) => BConstant(op.operation(l, r)),
-            (&BConstant(l), &XConstant(r)) => XConstant(op.operation(l, r)),
-            (&XConstant(l), &BConstant(r)) => XConstant(op.operation(l, r)),
-            (&XConstant(l), &XConstant(r)) => XConstant(op.operation(l, r)),
-            _ => return None,
-        };
-
-        let new_const = self.builder.make_leaf(new_const).consume();
-        let new_const = Rc::new(RefCell::new(new_const));
-        Some(new_const)
-    }
-
-    /// Apply constant folding to simplify the (sub)tree.
-    /// If the subtree is a leaf: no change.
-    /// If the subtree is a binary operation on:
+    /// Traverse the circuit and find all nodes that are equivalent. Note that
+    /// two nodes are equivalent if they compute the same value on all identical
+    /// inputs. Equivalence is different from identity, which is when two nodes
+    /// connect the same set of neighbors in the same way. (There may be two
+    /// different ways to compute the same result; they are equivalent but
+    /// unequal.)
     ///
-    ///  - constant x constant => fold
-    ///  - anything else       => can't fold
+    /// This function returns a list of lists of equivalent nodes such that
+    /// every inner list can be reduced to a single node without changing the
+    /// circuit's function.
     ///
-    /// This operation mutates self and returns true if a change was applied anywhere in the tree.
-    fn constant_fold_inner(&mut self) -> (bool, Option<Rc<RefCell<ConstraintCircuit<II>>>>) {
-        let BinaryOperation(_, lhs, rhs) = &self.circuit.borrow().expression.clone() else {
-            return (false, None);
-        };
+    /// Equivalent nodes are detected probabilistically using the multivariate
+    /// Schwartz-Zippel lemma. The false positive probability is zero (we can be
+    /// certain that equivalent nodes will be found). The false negative
+    /// probability is bounded by max_degree / (2^64 - 2^32 + 1)^3.
+    pub fn find_equivalent_nodes(&self) -> Vec<Vec<Rc<RefCell<ConstraintCircuit<II>>>>> {
+        let mut id_to_eval = HashMap::new();
+        let mut eval_to_ids = HashMap::new();
+        let mut id_to_node = HashMap::new();
+        Self::probe_random(
+            &self.circuit,
+            &mut id_to_eval,
+            &mut eval_to_ids,
+            &mut id_to_node,
+            rand::random(),
+        );
 
-        let mut change_tracker = false;
-        let mut lhs_as_monadic_value = self.new_monad_same_context(lhs.clone());
-        let (change_in_lhs, _) = lhs_as_monadic_value.constant_fold_inner();
-        change_tracker |= change_in_lhs;
-
-        let mut rhs_as_monadic_value = self.new_monad_same_context(rhs.clone());
-        let (change_in_rhs, _) = rhs_as_monadic_value.constant_fold_inner();
-        change_tracker |= change_in_rhs;
-
-        let equivalent_circuit = self.find_equivalent_expression();
-        if let Some(ref circuit) = equivalent_circuit {
-            change_tracker = true;
-            let id_to_remove = self.circuit.borrow().id;
-            self.builder.substitute(id_to_remove, circuit);
-            self.builder.all_nodes.borrow_mut().remove(self);
-        }
+        eval_to_ids
+            .values()
+            .filter(|ids| ids.len() >= 2)
+            .map(|ids| ids.iter().map(|i| id_to_node[i].clone()).collect_vec())
+            .collect_vec()
+    }
+
+    /// Populate the dictionaries such that they associate with every node in
+    /// the circuit its evaluation in a random point. The inputs are assigned
+    /// random values. Equivalent nodes are detected based on evaluating to the
+    /// same value using the Schwartz-Zippel lemma.
+    fn probe_random(
+        circuit: &Rc<RefCell<ConstraintCircuit<II>>>,
+        id_to_eval: &mut HashMap<usize, XFieldElement>,
+        eval_to_ids: &mut HashMap<XFieldElement, Vec<usize>>,
+        id_to_node: &mut HashMap<usize, Rc<RefCell<ConstraintCircuit<II>>>>,
+        master_seed: XFieldElement,
+    ) -> XFieldElement {
+        const DOMAIN_SEPARATOR_CURR_ROW: BFieldElement = BFieldElement::new(0);
+        const DOMAIN_SEPARATOR_NEXT_ROW: BFieldElement = BFieldElement::new(1);
+        const DOMAIN_SEPARATOR_CHALLENGE: BFieldElement = BFieldElement::new(2);
 
-        (change_tracker, equivalent_circuit)
-    }
+        let circuit_id = circuit.borrow().id;
+        if let Some(&xfe) = id_to_eval.get(&circuit_id) {
+            return xfe;
+        }
 
-    /// Reduce size of multitree by simplifying constant expressions such as `1 * MPol(_,_)`
-    pub fn constant_folding(circuits: &mut [ConstraintCircuitMonad<II>]) {
-        for circuit in circuits {
-            let mut mutated = true;
-            while mutated {
-                let (mutated_inner, maybe_new_root) = circuit.constant_fold_inner();
-                mutated = mutated_inner;
-                if let Some(new_root) = maybe_new_root {
-                    *circuit = circuit.new_monad_same_context(new_root);
-                }
+        let evaluation = match &circuit.borrow().expression {
+            BConstant(bfe) => bfe.lift(),
+            XConstant(xfe) => *xfe,
+            Input(input) => {
+                let [s0, s1, s2] = master_seed.coefficients;
+                let dom_sep = if input.is_current_row() {
+                    DOMAIN_SEPARATOR_CURR_ROW
+                } else {
+                    DOMAIN_SEPARATOR_NEXT_ROW
+                };
+                let i = bfe!(u64::try_from(input.column()).unwrap());
+                let Digest([d0, d1, d2, _, _]) = Tip5::hash_varlen(&[s0, s1, s2, dom_sep, i]);
+                xfe!([d0, d1, d2])
             }
-        }
+            Challenge(challenge) => {
+                let [s0, s1, s2] = master_seed.coefficients;
+                let dom_sep = DOMAIN_SEPARATOR_CHALLENGE;
+                let ch = bfe!(u64::try_from(*challenge).unwrap());
+                let Digest([d0, d1, d2, _, _]) = Tip5::hash_varlen(&[s0, s1, s2, dom_sep, ch]);
+                xfe!([d0, d1, d2])
+            }
+            BinaryOperation(bin_op, lhs, rhs) => {
+                let l = Self::probe_random(lhs, id_to_eval, eval_to_ids, id_to_node, master_seed);
+                let r = Self::probe_random(rhs, id_to_eval, eval_to_ids, id_to_node, master_seed);
+                bin_op.operation(l, r)
+            }
+        };
+
+        id_to_eval.insert(circuit_id, evaluation);
+        eval_to_ids.entry(evaluation).or_default().push(circuit_id);
+        id_to_node.insert(circuit_id, circuit.clone());
+
+        evaluation
     }
 
     /// Lowers the degree of a given multicircuit to the target degree.
@@ -1174,9 +1201,9 @@ mod tests {
         #[strategy(arb())] c: ConstraintCircuitMonad<DualRowIndicator>,
     ) {
         let one = || c.builder.one();
-        check_constant_folding_property(c.clone(), c.clone() * one())?;
-        check_constant_folding_property(c.clone(), one() * c.clone())?;
-        check_constant_folding_property(c.clone(), one() * c.clone() * one())?;
+        prop_assert_eq!(c.clone(), c.clone() * one());
+        prop_assert_eq!(c.clone(), one() * c.clone());
+        prop_assert_eq!(c.clone(), one() * c.clone() * one());
     }
 
     #[proptest]
@@ -1184,16 +1211,16 @@ mod tests {
         #[strategy(arb())] c: ConstraintCircuitMonad<DualRowIndicator>,
     ) {
         let zero = || c.builder.zero();
-        check_constant_folding_property(c.clone(), c.clone() + zero())?;
-        check_constant_folding_property(c.clone(), zero() + c.clone())?;
-        check_constant_folding_property(c.clone(), zero() + c.clone() + zero())?;
+        prop_assert_eq!(c.clone(), c.clone() + zero());
+        prop_assert_eq!(c.clone(), zero() + c.clone());
+        prop_assert_eq!(c.clone(), zero() + c.clone() + zero());
     }
 
     #[proptest]
     fn constant_folding_can_deal_with_subtracting_zero(
         #[strategy(arb())] c: ConstraintCircuitMonad<DualRowIndicator>,
     ) {
-        check_constant_folding_property(c.clone(), c.clone() - c.builder.zero())?;
+        prop_assert_eq!(c.clone(), c.clone() - c.builder.zero());
     }
 
     /// Terribly confusing, super rare bug that's extremely difficult to reproduce or pin down:
@@ -1219,44 +1246,18 @@ mod tests {
             _ => unreachable!(),
         };
 
-        check_constant_folding_property(c, redundant_circuit)?;
-    }
-
-    fn check_constant_folding_property<II: InputIndicator>(
-        circuit: ConstraintCircuitMonad<II>,
-        circuit_with_redundancy: ConstraintCircuitMonad<II>,
-    ) -> Result<(), TestCaseError> {
-        prop_assert_ne!(&circuit, &circuit_with_redundancy);
-        let mut circuits = [circuit, circuit_with_redundancy];
-        ConstraintCircuitMonad::constant_folding(&mut circuits);
-        let [circuit_0, circuit_1] = circuits;
-        prop_assert_eq!(circuit_0, circuit_1);
-        Ok(())
+        prop_assert_eq!(c, redundant_circuit);
     }
 
     #[proptest]
     fn constant_folding_does_not_replace_0_minus_circuit_with_the_circuit(
         #[strategy(arb())] circuit: ConstraintCircuitMonad<DualRowIndicator>,
     ) {
+        if circuit.circuit.borrow().is_zero() {
+            return Err(TestCaseError::Reject("0 - 0 actually is 0".into()));
+        }
         let zero_minus_circuit = circuit.builder.zero() - circuit.clone();
         prop_assert_ne!(&circuit, &zero_minus_circuit);
-
-        let mut circuits = [circuit, zero_minus_circuit];
-        ConstraintCircuitMonad::constant_folding(&mut circuits);
-        let circuit = circuits[0].circuit.borrow().expression.clone();
-        let zero_minus_circuit = circuits[1].circuit.borrow().expression.clone();
-
-        match (circuit, zero_minus_circuit) {
-            // Takes care of special case `circuit == 0`, where `0 - c == c`. Also gives stronger
-            // guarantees if folding reduces circuit to constant, but that's just bonus.
-            (BConstant(l), BConstant(r)) => prop_assert_eq!(l, -r),
-            (XConstant(l), XConstant(r)) => prop_assert_eq!(l, -r),
-            (BConstant(_), XConstant(_)) | (XConstant(_), BConstant(_)) => {
-                let reason = "`circuit` and `0 - circuit` must be of same type";
-                return Err(TestCaseError::fail(reason));
-            }
-            _ => prop_assert_ne!(&circuits[0], &circuits[1]),
-        }
     }
 
     /// Recursively evaluates the given constraint circuit and its sub-circuits on the given
@@ -1351,9 +1352,7 @@ mod tests {
         ) -> Vec<ConstraintCircuitMonad<II>>,
     ) -> Vec<ConstraintCircuitMonad<II>> {
         let circuit_builder = ConstraintCircuitBuilder::new();
-        let mut multicircuit = multicircuit_builder(&circuit_builder);
-        ConstraintCircuitMonad::constant_folding(&mut multicircuit);
-        multicircuit
+        multicircuit_builder(&circuit_builder)
     }
 
     #[test]
@@ -2137,18 +2136,24 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "requires a proper debugging session, or maybe additional optimizations"]
-    fn constraint_circuit_builder_reuses_existing_nodes_when_folding_constants() {
+    fn equivalent_nodes_are_detected_when_present() {
         let builder = ConstraintCircuitBuilder::new();
-        let constant = |c| builder.b_constant(c);
-        let base_row = |r| builder.input(BaseRow(r));
 
-        let c_0 = base_row(0);
-        let c_1 = base_row(0) - constant(0);
-        let mut constraints = [c_0, c_1];
+        let x = |i| builder.input(BaseRow(i));
+        let ch = |i: usize| builder.challenge(i);
 
-        ConstraintCircuitMonad::constant_folding(&mut constraints);
-        let mut constraints = constraints.iter().map(|c| c.consume()).collect_vec();
-        ConstraintCircuit::assert_unique_ids(&mut constraints);
+        let u0 = x(0) + x(1);
+        let u1 = x(2) + x(3);
+        let v = u0 * u1;
+
+        let z0 = x(0) * x(2);
+        let z2 = x(1) * x(3);
+
+        let z1 = x(1) * x(2) + x(0) * x(3);
+        let w = v - z0 - z2;
+        assert!(w.find_equivalent_nodes().is_empty());
+
+        let o = ch(0) * z1 - ch(1) * w;
+        assert!(!o.find_equivalent_nodes().is_empty());
     }
 }
diff --git a/triton-vm/src/table/processor_table.rs b/triton-vm/src/table/processor_table.rs
index f8bba934..8c75a965 100644
--- a/triton-vm/src/table/processor_table.rs
+++ b/triton-vm/src/table/processor_table.rs
@@ -1,12 +1,14 @@
 use std::cmp::max;
 use std::ops::Mul;
 
+use itertools::izip;
 use itertools::Itertools;
 use ndarray::parallel::prelude::*;
 use ndarray::*;
 use num_traits::One;
 use num_traits::Zero;
 use strum::EnumCount;
+use twenty_first::prelude::x_field_element::EXTENSION_DEGREE;
 use twenty_first::prelude::*;
 
 use crate::aet::AlgebraicExecutionTrace;
@@ -16,6 +18,7 @@ use crate::instruction::InstructionBit;
 use crate::instruction::ALL_INSTRUCTIONS;
 use crate::op_stack::NumberOfWords;
 use crate::op_stack::OpStackElement;
+use crate::op_stack::NUM_OP_STACK_REGISTERS;
 use crate::table::challenges::ChallengeId;
 use crate::table::challenges::ChallengeId::*;
 use crate::table::challenges::Challenges;
@@ -45,6 +48,7 @@ impl ProcessorTable {
     ) {
         let num_rows = aet.processor_trace.nrows();
         let mut clk_jump_diff_multiplicities = Array1::zeros([num_rows]);
+
         for clk_jump_diff in clk_jump_diffs_op_stack
             .iter()
             .chain(clk_jump_diffs_ram)
@@ -91,6 +95,7 @@ impl ProcessorTable {
         let num_padding_rows = processor_table.nrows() - processor_table_len;
         let num_padding_rows = bfe!(num_padding_rows as u64);
         let mut row_1 = processor_table.row_mut(1);
+
         row_1[ClockJumpDifferenceLookupMultiplicity.base_table_index()] += num_padding_rows;
     }
 
@@ -112,7 +117,7 @@ impl ProcessorTable {
         let mut hash_digest_running_evaluation = EvalArg::default_initial();
         let mut sponge_running_evaluation = EvalArg::default_initial();
         let mut u32_table_running_sum_log_derivative = LookupArg::default_initial();
-        let mut clock_jump_diff_lookup_op_stack_log_derivative = LookupArg::default_initial();
+        let mut clock_jump_diff_lookup_log_derivative = LookupArg::default_initial();
 
         let mut previous_row: Option<ArrayView1<BFieldElement>> = None;
         for row_idx in 0..base_table.nrows() {
@@ -154,12 +159,14 @@ impl ProcessorTable {
                     .inverse();
             }
 
+            // OpStack Table
             op_stack_table_running_product *= Self::factor_for_op_stack_table_running_product(
                 previous_row,
                 current_row,
                 challenges,
             );
 
+            // RAM Table
             if let Some(factor) =
                 Self::factor_for_ram_table_running_product(previous_row, current_row, challenges)
             {
@@ -313,7 +320,7 @@ impl ProcessorTable {
             // Lookup Argument for clock jump differences
             let lookup_multiplicity =
                 current_row[ClockJumpDifferenceLookupMultiplicity.base_table_index()];
-            clock_jump_diff_lookup_op_stack_log_derivative +=
+            clock_jump_diff_lookup_log_derivative +=
                 (challenges[ClockJumpDifferenceLookupIndeterminate] - clk).inverse()
                     * lookup_multiplicity;
 
@@ -331,7 +338,7 @@ impl ProcessorTable {
             extension_row[U32LookupClientLogDerivative.ext_table_index()] =
                 u32_table_running_sum_log_derivative;
             extension_row[ClockJumpDifferenceLookupServerLogDerivative.ext_table_index()] =
-                clock_jump_diff_lookup_op_stack_log_derivative;
+                clock_jump_diff_lookup_log_derivative;
             previous_row = Some(current_row);
         }
     }
@@ -401,37 +408,80 @@ impl ProcessorTable {
         let previous_row = maybe_previous_row?;
         let instruction = Self::instruction_from_row(previous_row)?;
 
+        let clk = previous_row[CLK.base_table_index()];
         let instruction_type = match instruction {
             ReadMem(_) => ram_table::INSTRUCTION_TYPE_READ,
             WriteMem(_) => ram_table::INSTRUCTION_TYPE_WRITE,
+            XxDotStep => ram_table::INSTRUCTION_TYPE_READ,
+            XbDotStep => ram_table::INSTRUCTION_TYPE_READ,
             _ => return None,
         };
+        let mut accesses = vec![];
 
-        // longer stack means relevant information is on top of stack, i.e., in stack registers
-        let row_with_longer_stack = match instruction {
-            ReadMem(_) => current_row.view(),
-            WriteMem(_) => previous_row.view(),
+        match instruction {
+            ReadMem(_) | WriteMem(_) => {
+                // longer stack means relevant information is on top of stack, i.e., in stack registers
+                let row_with_longer_stack = match instruction {
+                    ReadMem(_) => current_row.view(),
+                    WriteMem(_) => previous_row.view(),
+                    _ => unreachable!(),
+                };
+                let op_stack_delta = instruction.op_stack_size_influence().unsigned_abs() as usize;
+
+                let num_ram_pointers = 1;
+                for ram_pointer_offset in 0..op_stack_delta {
+                    let ram_value_index = ram_pointer_offset + num_ram_pointers;
+                    let ram_value_column = Self::op_stack_column_by_index(ram_value_index);
+                    let ram_value = row_with_longer_stack[ram_value_column.base_table_index()];
+                    let offset_ram_pointer = Self::offset_ram_pointer(
+                        instruction,
+                        row_with_longer_stack,
+                        ram_pointer_offset,
+                    );
+                    accesses.push((offset_ram_pointer, ram_value));
+                }
+            }
+            XxDotStep => {
+                let rhs_pointer = previous_row[ST0.base_table_index()];
+                let lhs_pointer = previous_row[ST1.base_table_index()];
+                let hv0 = previous_row[HV0.base_table_index()];
+                let hv1 = previous_row[HV1.base_table_index()];
+                let hv2 = previous_row[HV2.base_table_index()];
+                let hv3 = previous_row[HV3.base_table_index()];
+                let hv4 = previous_row[HV4.base_table_index()];
+                let hv5 = previous_row[HV5.base_table_index()];
+                accesses.push((rhs_pointer, hv0));
+                accesses.push((rhs_pointer + bfe!(1), hv1));
+                accesses.push((rhs_pointer + bfe!(2), hv2));
+                accesses.push((lhs_pointer, hv3));
+                accesses.push((lhs_pointer + bfe!(1), hv4));
+                accesses.push((lhs_pointer + bfe!(2), hv5));
+            }
+            XbDotStep => {
+                let rhs_pointer = previous_row[ST0.base_table_index()];
+                let lhs_pointer = previous_row[ST1.base_table_index()];
+                let hv0 = previous_row[HV0.base_table_index()];
+                let hv1 = previous_row[HV1.base_table_index()];
+                let hv2 = previous_row[HV2.base_table_index()];
+                let hv3 = previous_row[HV3.base_table_index()];
+                accesses.push((rhs_pointer, hv0));
+                accesses.push((lhs_pointer, hv1));
+                accesses.push((lhs_pointer + bfe!(1), hv2));
+                accesses.push((lhs_pointer + bfe!(2), hv3));
+            }
             _ => unreachable!(),
         };
-        let op_stack_delta = instruction.op_stack_size_influence().unsigned_abs() as usize;
-
-        let mut factor = xfe!(1);
-        for ram_pointer_offset in 0..op_stack_delta {
-            let num_ram_pointers = 1;
-            let ram_value_index = ram_pointer_offset + num_ram_pointers;
-            let ram_value_column = Self::op_stack_column_by_index(ram_value_index);
-            let ram_value = row_with_longer_stack[ram_value_column.base_table_index()];
-            let offset_ram_pointer =
-                Self::offset_ram_pointer(instruction, row_with_longer_stack, ram_pointer_offset);
 
-            let clk = previous_row[CLK.base_table_index()];
-            let compressed_row = clk * challenges[RamClkWeight]
-                + instruction_type * challenges[RamInstructionTypeWeight]
-                + offset_ram_pointer * challenges[RamPointerWeight]
-                + ram_value * challenges[RamValueWeight];
-            factor *= challenges[RamIndeterminate] - compressed_row;
-        }
-        Some(factor)
+        accesses
+            .into_iter()
+            .map(|(ramp, ramv)| {
+                clk * challenges[RamClkWeight]
+                    + instruction_type * challenges[RamInstructionTypeWeight]
+                    + ramp * challenges[RamPointerWeight]
+                    + ramv * challenges[RamValueWeight]
+            })
+            .map(|compressed_row| challenges[RamIndeterminate] - compressed_row)
+            .reduce(|l, r| l * r)
     }
 
     fn offset_ram_pointer(
@@ -579,10 +629,15 @@ impl ExtProcessorTable {
                     * (jump_stack_indeterminate - compressed_row_for_jump_stack_table);
 
         // clock jump difference lookup argument
-        // A clock jump difference of 0 is illegal. Hence, the initial is recorded.
+        // The clock jump difference logarithmic derivative accumulator starts
+        // off having accumulated the contribution from the first row.
+        // Note that (challenge(ClockJumpDifferenceLookupIndeterminate) - base_row(CLK))
+        // collapses to challenge(ClockJumpDifferenceLookupIndeterminate)
+        // because base_row(CLK) = 0 is already a constraint.
         let clock_jump_diff_lookup_log_derivative_is_initialized_correctly =
             ext_row(ClockJumpDifferenceLookupServerLogDerivative)
-                - x_constant(LookupArg::default_initial());
+                * challenge(ClockJumpDifferenceLookupIndeterminate)
+                - base_row(ClockJumpDifferenceLookupMultiplicity);
 
         // from processor to hash table
         let hash_selector = base_row(CI) - constant(Instruction::Hash.opcode());
@@ -814,7 +869,7 @@ impl ExtProcessorTable {
             ],
             Self::instruction_group_keep_jump_stack(circuit_builder),
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat();
@@ -872,7 +927,9 @@ impl ExtProcessorTable {
         ]
     }
 
-    fn instruction_group_keep_ram(
+    /// The permutation argument accumulator with the RAM table does
+    /// not change, because there is no RAM access.
+    fn instruction_group_no_ram(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
         let curr_ext_row = |col: ProcessorExtTableColumn| {
@@ -894,8 +951,11 @@ impl ExtProcessorTable {
         ]
     }
 
-    fn instruction_group_op_stack_remains_and_top_three_elements_unconstrained(
+    /// Op Stack height does not change and except for the top n elements,
+    /// the values remain also.
+    fn instruction_group_op_stack_remains_except_top_n(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
+        n: usize,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
         let curr_base_row = |col: ProcessorBaseTableColumn| {
             circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
@@ -904,55 +964,26 @@ impl ExtProcessorTable {
             circuit_builder.input(NextBaseRow(col.master_base_table_index()))
         };
 
-        let curr_ext_row = |col: ProcessorExtTableColumn| {
-            circuit_builder.input(CurrentExtRow(col.master_ext_table_index()))
-        };
-        let next_ext_row = |col: ProcessorExtTableColumn| {
-            circuit_builder.input(NextExtRow(col.master_ext_table_index()))
-        };
+        let all_but_n_top_elements_remain = (n..NUM_OP_STACK_REGISTERS)
+            .map(ProcessorTable::op_stack_column_by_index)
+            .map(|sti| next_base_row(sti) - curr_base_row(sti))
+            .collect_vec();
+        let ram_perm_arg_remains = Self::instruction_group_keep_op_stack_height(circuit_builder);
 
-        vec![
-            next_base_row(ST3) - curr_base_row(ST3),
-            next_base_row(ST4) - curr_base_row(ST4),
-            next_base_row(ST5) - curr_base_row(ST5),
-            next_base_row(ST6) - curr_base_row(ST6),
-            next_base_row(ST7) - curr_base_row(ST7),
-            next_base_row(ST8) - curr_base_row(ST8),
-            next_base_row(ST9) - curr_base_row(ST9),
-            next_base_row(ST10) - curr_base_row(ST10),
-            next_base_row(ST11) - curr_base_row(ST11),
-            next_base_row(ST12) - curr_base_row(ST12),
-            next_base_row(ST13) - curr_base_row(ST13),
-            next_base_row(ST14) - curr_base_row(ST14),
-            next_base_row(ST15) - curr_base_row(ST15),
-            next_base_row(OpStackPointer) - curr_base_row(OpStackPointer),
-            next_ext_row(OpStackTablePermArg) - curr_ext_row(OpStackTablePermArg),
-        ]
+        [all_but_n_top_elements_remain, ram_perm_arg_remains].concat()
     }
 
-    fn instruction_group_unop(
+    /// Op stack does not change, _i.e._, all stack elements persist
+    fn instruction_group_keep_op_stack(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
-        let curr_base_row = |col: ProcessorBaseTableColumn| {
-            circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
-        };
-        let next_base_row = |col: ProcessorBaseTableColumn| {
-            circuit_builder.input(NextBaseRow(col.master_base_table_index()))
-        };
-
-        let specific_constraints = vec![
-            next_base_row(ST1) - curr_base_row(ST1),
-            next_base_row(ST2) - curr_base_row(ST2),
-        ];
-        let inherited_constraints =
-            Self::instruction_group_op_stack_remains_and_top_three_elements_unconstrained(
-                circuit_builder,
-            );
-
-        [specific_constraints, inherited_constraints].concat()
+        Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 0)
     }
 
-    fn instruction_group_keep_op_stack(
+    /// Op stack *height* does not change, _i.e._, the accumulator for the
+    /// permutation argument with the op stack table remains the same as does
+    /// the op stack pointer.
+    fn instruction_group_keep_op_stack_height(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
         let curr_base_row = |col: ProcessorBaseTableColumn| {
@@ -961,11 +992,18 @@ impl ExtProcessorTable {
         let next_base_row = |col: ProcessorBaseTableColumn| {
             circuit_builder.input(NextBaseRow(col.master_base_table_index()))
         };
-
-        let specific_constraints = vec![next_base_row(ST0) - curr_base_row(ST0)];
-        let inherited_constraints = Self::instruction_group_unop(circuit_builder);
-
-        [specific_constraints, inherited_constraints].concat()
+        let curr_ext_row = |col: ProcessorExtTableColumn| {
+            circuit_builder.input(CurrentExtRow(col.master_ext_table_index()))
+        };
+        let next_ext_row = |col: ProcessorExtTableColumn| {
+            circuit_builder.input(NextExtRow(col.master_ext_table_index()))
+        };
+        vec![
+            // permutation argument accumulator does not change
+            next_ext_row(OpStackTablePermArg) - curr_ext_row(OpStackTablePermArg),
+            // op stack pointer does not change
+            next_base_row(OpStackPointer) - curr_base_row(OpStackPointer),
+        ]
     }
 
     fn instruction_group_grow_op_stack_and_top_two_elements_unconstrained(
@@ -1106,6 +1144,7 @@ impl ExtProcessorTable {
         ]
     }
 
+    /// Increase the instruction pointer by 1.
     fn instruction_group_step_1(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
@@ -1126,6 +1165,7 @@ impl ExtProcessorTable {
         .concat()
     }
 
+    /// Increase the instruction pointer by 2.
     fn instruction_group_step_2(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
@@ -1262,7 +1302,7 @@ impl ExtProcessorTable {
             Self::instruction_group_decompose_arg(circuit_builder),
             Self::stack_shrinks_by_any_of(circuit_builder, &NumberOfWords::legal_values()),
             Self::prohibit_any_illegal_number_of_words(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1283,7 +1323,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_grow_op_stack(circuit_builder),
             Self::instruction_group_step_2(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1297,7 +1337,7 @@ impl ExtProcessorTable {
             Self::instruction_group_decompose_arg(circuit_builder),
             Self::stack_grows_by_any_of(circuit_builder, &NumberOfWords::legal_values()),
             Self::prohibit_any_illegal_number_of_words(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1337,7 +1377,7 @@ impl ExtProcessorTable {
             Self::instruction_group_decompose_arg(circuit_builder),
             Self::instruction_group_step_2(circuit_builder),
             Self::instruction_group_grow_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1351,15 +1391,9 @@ impl ExtProcessorTable {
         let curr_base_row = |col: ProcessorBaseTableColumn| {
             circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
         };
-        let curr_ext_row = |col: ProcessorExtTableColumn| {
-            circuit_builder.input(CurrentExtRow(col.master_ext_table_index()))
-        };
         let next_base_row = |col: ProcessorBaseTableColumn| {
             circuit_builder.input(NextBaseRow(col.master_base_table_index()))
         };
-        let next_ext_row = |col: ProcessorExtTableColumn| {
-            circuit_builder.input(NextExtRow(col.master_ext_table_index()))
-        };
 
         let specific_constraints = vec![
             indicator_poly(0),
@@ -1408,15 +1442,14 @@ impl ExtProcessorTable {
             (one() - indicator_poly(13)) * (next_base_row(ST13) - curr_base_row(ST13)),
             (one() - indicator_poly(14)) * (next_base_row(ST14) - curr_base_row(ST14)),
             (one() - indicator_poly(15)) * (next_base_row(ST15) - curr_base_row(ST15)),
-            next_base_row(OpStackPointer) - curr_base_row(OpStackPointer),
-            next_ext_row(OpStackTablePermArg) - curr_ext_row(OpStackTablePermArg),
         ];
         [
             specific_constraints,
             Self::instruction_group_decompose_arg(circuit_builder),
             Self::instruction_group_step_2(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
+            Self::instruction_group_keep_op_stack_height(circuit_builder),
         ]
         .concat()
     }
@@ -1427,7 +1460,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1487,7 +1520,7 @@ impl ExtProcessorTable {
             Self::next_instruction_range_check_constraints_for_instruction_skiz(circuit_builder),
             Self::instruction_group_keep_jump_stack(circuit_builder),
             Self::instruction_group_shrink_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1551,7 +1584,7 @@ impl ExtProcessorTable {
         [
             specific_constraints,
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1578,7 +1611,7 @@ impl ExtProcessorTable {
         [
             specific_constraints,
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1601,7 +1634,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_keep_jump_stack(circuit_builder),
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1623,7 +1656,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_shrink_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1647,7 +1680,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1705,7 +1738,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             op_stack_shrinks_by_5_and_top_5_unconstrained,
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1771,7 +1804,7 @@ impl ExtProcessorTable {
             maybe_shift_known_digest_down_the_stack,
             op_stack_grows_by_5_and_top_11_elements_unconstrained,
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1795,7 +1828,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::constraints_for_shrinking_stack_by(circuit_builder, 5),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1807,7 +1840,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_keep_op_stack(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1819,7 +1852,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::constraints_for_shrinking_stack_by(circuit_builder, 10),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1831,7 +1864,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::constraints_for_growing_stack_by(circuit_builder, 10),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1853,7 +1886,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1875,7 +1908,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1897,8 +1930,8 @@ impl ExtProcessorTable {
         [
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_unop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 1),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1940,7 +1973,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -1988,7 +2021,7 @@ impl ExtProcessorTable {
                 circuit_builder,
             ),
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2000,7 +2033,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2012,7 +2045,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2024,7 +2057,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2035,8 +2068,8 @@ impl ExtProcessorTable {
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
         [
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_unop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 1),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2048,7 +2081,7 @@ impl ExtProcessorTable {
         [
             Self::instruction_group_step_1(circuit_builder),
             Self::instruction_group_binop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2068,19 +2101,12 @@ impl ExtProcessorTable {
         let numerator_is_quotient_times_denominator_plus_remainder =
             curr_base_row(ST0) - curr_base_row(ST1) * next_base_row(ST1) - next_base_row(ST0);
 
-        let st2_does_not_change = next_base_row(ST2) - curr_base_row(ST2);
-
-        let specific_constraints = vec![
-            numerator_is_quotient_times_denominator_plus_remainder,
-            st2_does_not_change,
-        ];
+        let specific_constraints = vec![numerator_is_quotient_times_denominator_plus_remainder];
         [
             specific_constraints,
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_op_stack_remains_and_top_three_elements_unconstrained(
-                circuit_builder,
-            ),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 2),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2091,8 +2117,8 @@ impl ExtProcessorTable {
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
         [
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_unop(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 1),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2121,7 +2147,7 @@ impl ExtProcessorTable {
             specific_constraints,
             Self::constraints_for_shrinking_stack_by_3_and_top_3_unconstrained(circuit_builder),
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2137,31 +2163,19 @@ impl ExtProcessorTable {
             circuit_builder.input(NextBaseRow(col.master_base_table_index()))
         };
 
-        let st0_becomes_coefficient_0 = next_base_row(ST0)
-            - (curr_base_row(ST0) * curr_base_row(ST3)
-                - curr_base_row(ST2) * curr_base_row(ST4)
-                - curr_base_row(ST1) * curr_base_row(ST5));
-        let st1_becomes_coefficient_1 = next_base_row(ST1)
-            - (curr_base_row(ST1) * curr_base_row(ST3) + curr_base_row(ST0) * curr_base_row(ST4)
-                - curr_base_row(ST2) * curr_base_row(ST5)
-                + curr_base_row(ST2) * curr_base_row(ST4)
-                + curr_base_row(ST1) * curr_base_row(ST5));
-        let st2_becomes_coefficient_2 = next_base_row(ST2)
-            - (curr_base_row(ST2) * curr_base_row(ST3)
-                + curr_base_row(ST1) * curr_base_row(ST4)
-                + curr_base_row(ST0) * curr_base_row(ST5)
-                + curr_base_row(ST2) * curr_base_row(ST5));
+        let [x0, x1, x2, y0, y1, y2] = [ST0, ST1, ST2, ST3, ST4, ST5].map(curr_base_row);
+        let [c0, c1, c2] = Self::xx_product([x0, x1, x2], [y0, y1, y2]);
 
         let specific_constraints = vec![
-            st0_becomes_coefficient_0,
-            st1_becomes_coefficient_1,
-            st2_becomes_coefficient_2,
+            next_base_row(ST0) - c0,
+            next_base_row(ST1) - c1,
+            next_base_row(ST2) - c2,
         ];
         [
             specific_constraints,
             Self::constraints_for_shrinking_stack_by_3_and_top_3_unconstrained(circuit_builder),
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2203,11 +2217,9 @@ impl ExtProcessorTable {
         ];
         [
             specific_constraints,
-            Self::instruction_group_op_stack_remains_and_top_three_elements_unconstrained(
-                circuit_builder,
-            ),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 3),
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2223,17 +2235,13 @@ impl ExtProcessorTable {
             circuit_builder.input(NextBaseRow(col.master_base_table_index()))
         };
 
-        let first_coeff_scalar_multiplication =
-            next_base_row(ST0) - curr_base_row(ST0) * curr_base_row(ST1);
-        let secnd_coeff_scalar_multiplication =
-            next_base_row(ST1) - curr_base_row(ST0) * curr_base_row(ST2);
-        let third_coeff_scalar_multiplication =
-            next_base_row(ST2) - curr_base_row(ST0) * curr_base_row(ST3);
+        let [x, y0, y1, y2] = [ST0, ST1, ST2, ST3].map(curr_base_row);
+        let [c0, c1, c2] = Self::xb_product([y0, y1, y2], x);
 
         let specific_constraints = vec![
-            first_coeff_scalar_multiplication,
-            secnd_coeff_scalar_multiplication,
-            third_coeff_scalar_multiplication,
+            next_base_row(ST0) - c0,
+            next_base_row(ST1) - c1,
+            next_base_row(ST2) - c2,
         ];
         [
             specific_constraints,
@@ -2241,7 +2249,7 @@ impl ExtProcessorTable {
                 circuit_builder,
             ),
             Self::instruction_group_step_1(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             Self::instruction_group_no_io(circuit_builder),
         ]
         .concat()
@@ -2263,7 +2271,7 @@ impl ExtProcessorTable {
             Self::instruction_group_decompose_arg(circuit_builder),
             read_any_legal_number_of_words,
             Self::prohibit_any_illegal_number_of_words(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             vec![Self::running_evaluation_for_standard_output_remains_unchanged(circuit_builder)],
         ]
         .concat()
@@ -2285,12 +2293,184 @@ impl ExtProcessorTable {
             Self::instruction_group_decompose_arg(circuit_builder),
             write_any_of_1_through_5_elements,
             Self::prohibit_any_illegal_number_of_words(circuit_builder),
-            Self::instruction_group_keep_ram(circuit_builder),
+            Self::instruction_group_no_ram(circuit_builder),
             vec![Self::running_evaluation_for_standard_input_remains_unchanged(circuit_builder)],
         ]
         .concat()
     }
 
+    /// Update the accumulator for the Permutation Argument with the RAM table in
+    /// accordance with reading a bunch of words from the indicated ram pointers to
+    /// the indicated destination registers.
+    ///
+    /// Does not constrain the op stack by default.[^stack] For that, see:
+    /// [`Self::read_from_ram_any_of`].
+    ///
+    /// [^stack]: Op stack registers used in arguments will be constrained.
+    fn read_from_ram_to<const N: usize>(
+        circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
+        ram_pointers: [ConstraintCircuitMonad<DualRowIndicator>; N],
+        destinations: [ConstraintCircuitMonad<DualRowIndicator>; N],
+    ) -> ConstraintCircuitMonad<DualRowIndicator> {
+        let curr_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
+        };
+        let curr_ext_row = |col: ProcessorExtTableColumn| {
+            circuit_builder.input(CurrentExtRow(col.master_ext_table_index()))
+        };
+        let next_ext_row = |col: ProcessorExtTableColumn| {
+            circuit_builder.input(NextExtRow(col.master_ext_table_index()))
+        };
+        let challenge = |c: ChallengeId| circuit_builder.challenge(c);
+        let constant = |bfe| circuit_builder.b_constant(bfe);
+
+        let compress_row = |(ram_pointer, destination)| {
+            curr_base_row(CLK) * challenge(RamClkWeight)
+                + constant(ram_table::INSTRUCTION_TYPE_READ) * challenge(RamInstructionTypeWeight)
+                + ram_pointer * challenge(RamPointerWeight)
+                + destination * challenge(RamValueWeight)
+        };
+
+        let factor = ram_pointers
+            .into_iter()
+            .zip(destinations)
+            .map(compress_row)
+            .map(|compressed_row| challenge(RamIndeterminate) - compressed_row)
+            .reduce(|l, r| l * r)
+            .unwrap_or_else(|| constant(bfe!(1)));
+        curr_ext_row(RamTablePermArg) * factor - next_ext_row(RamTablePermArg)
+    }
+
+    fn xx_product<Indicator: InputIndicator>(
+        [x_0, x_1, x_2]: [ConstraintCircuitMonad<Indicator>; EXTENSION_DEGREE],
+        [y_0, y_1, y_2]: [ConstraintCircuitMonad<Indicator>; EXTENSION_DEGREE],
+    ) -> [ConstraintCircuitMonad<Indicator>; EXTENSION_DEGREE] {
+        let z0 = x_0.clone() * y_0.clone();
+        let z1 = x_1.clone() * y_0.clone() + x_0.clone() * y_1.clone();
+        let z2 = x_2.clone() * y_0 + x_1.clone() * y_1.clone() + x_0 * y_2.clone();
+        let z3 = x_2.clone() * y_1 + x_1 * y_2.clone();
+        let z4 = x_2 * y_2;
+
+        // reduce modulo x³ - x + 1
+        [z0 - z3.clone(), z1 - z4.clone() + z3, z2 + z4]
+    }
+
+    fn xb_product<Indicator: InputIndicator>(
+        [x_0, x_1, x_2]: [ConstraintCircuitMonad<Indicator>; EXTENSION_DEGREE],
+        y: ConstraintCircuitMonad<Indicator>,
+    ) -> [ConstraintCircuitMonad<Indicator>; EXTENSION_DEGREE] {
+        let z0 = x_0 * y.clone();
+        let z1 = x_1 * y.clone();
+        let z2 = x_2 * y;
+        [z0, z1, z2]
+    }
+
+    fn update_dotstep_accumulator(
+        circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
+        accumulator_indices: [ProcessorBaseTableColumn; EXTENSION_DEGREE],
+        difference: [ConstraintCircuitMonad<DualRowIndicator>; EXTENSION_DEGREE],
+    ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
+        let curr_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
+        };
+        let next_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(NextBaseRow(col.master_base_table_index()))
+        };
+        let curr = accumulator_indices.map(curr_base_row);
+        let next = accumulator_indices.map(next_base_row);
+        izip!(curr, next, difference)
+            .map(|(c, n, d)| n - c - d)
+            .collect()
+    }
+
+    fn instruction_xxdotstep(
+        circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
+    ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
+        let curr_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
+        };
+        let next_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(NextBaseRow(col.master_base_table_index()))
+        };
+        let constant = |c| circuit_builder.b_constant(c);
+
+        let increment_ram_pointer_st0 = next_base_row(ST0) - curr_base_row(ST0) - constant(3);
+        let increment_ram_pointer_st1 = next_base_row(ST1) - curr_base_row(ST1) - constant(3);
+
+        let rhs_ptr0 = curr_base_row(ST0);
+        let rhs_ptr1 = rhs_ptr0.clone() + constant(1);
+        let rhs_ptr2 = rhs_ptr0.clone() + constant(2);
+        let lhs_ptr0 = curr_base_row(ST1);
+        let lhs_ptr1 = lhs_ptr0.clone() + constant(1);
+        let lhs_ptr2 = lhs_ptr0.clone() + constant(2);
+        let ram_read_sources = [rhs_ptr0, rhs_ptr1, rhs_ptr2, lhs_ptr0, lhs_ptr1, lhs_ptr2];
+        let ram_read_destinations = [HV0, HV1, HV2, HV3, HV4, HV5].map(curr_base_row);
+        let read_two_xfes_from_ram =
+            Self::read_from_ram_to(circuit_builder, ram_read_sources, ram_read_destinations);
+
+        let ram_pointer_constraints = vec![
+            increment_ram_pointer_st0,
+            increment_ram_pointer_st1,
+            read_two_xfes_from_ram,
+        ];
+
+        let [hv0, hv1, hv2, hv3, hv4, hv5] = [HV0, HV1, HV2, HV3, HV4, HV5].map(curr_base_row);
+        let hv_product = Self::xx_product([hv0, hv1, hv2], [hv3, hv4, hv5]);
+
+        [
+            ram_pointer_constraints,
+            Self::update_dotstep_accumulator(circuit_builder, [ST2, ST3, ST4], hv_product),
+            Self::instruction_group_step_1(circuit_builder),
+            Self::instruction_group_no_io(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 5),
+            Self::instruction_group_keep_jump_stack(circuit_builder),
+        ]
+        .concat()
+    }
+
+    fn instruction_xbdotstep(
+        circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
+    ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
+        let curr_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(CurrentBaseRow(col.master_base_table_index()))
+        };
+        let next_base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(NextBaseRow(col.master_base_table_index()))
+        };
+        let constant = |c| circuit_builder.b_constant(c);
+
+        let increment_ram_pointer_st0 = next_base_row(ST0) - curr_base_row(ST0) - constant(1);
+        let increment_ram_pointer_st1 = next_base_row(ST1) - curr_base_row(ST1) - constant(3);
+
+        let rhs_ptr0 = curr_base_row(ST0);
+        let lhs_ptr0 = curr_base_row(ST1);
+        let lhs_ptr1 = lhs_ptr0.clone() + constant(1);
+        let lhs_ptr2 = lhs_ptr0.clone() + constant(2);
+        let ram_read_sources = [rhs_ptr0, lhs_ptr0, lhs_ptr1, lhs_ptr2];
+        let ram_read_destinations = [HV0, HV1, HV2, HV3].map(curr_base_row);
+        let read_bfe_and_xfe_from_ram =
+            Self::read_from_ram_to(circuit_builder, ram_read_sources, ram_read_destinations);
+
+        let ram_pointer_constraints = vec![
+            increment_ram_pointer_st0,
+            increment_ram_pointer_st1,
+            read_bfe_and_xfe_from_ram,
+        ];
+
+        let [hv0, hv1, hv2, hv3] = [HV0, HV1, HV2, HV3].map(curr_base_row);
+        let hv_product = Self::xb_product([hv1, hv2, hv3], hv0);
+
+        [
+            ram_pointer_constraints,
+            Self::update_dotstep_accumulator(circuit_builder, [ST2, ST3, ST4], hv_product),
+            Self::instruction_group_step_1(circuit_builder),
+            Self::instruction_group_no_io(circuit_builder),
+            Self::instruction_group_op_stack_remains_except_top_n(circuit_builder, 5),
+            Self::instruction_group_keep_jump_stack(circuit_builder),
+        ]
+        .concat()
+    }
+
     fn get_transition_constraints_for_instruction(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
         instruction: Instruction,
@@ -2329,14 +2509,18 @@ impl ExtProcessorTable {
             DivMod => ExtProcessorTable::instruction_div_mod(circuit_builder),
             PopCount => ExtProcessorTable::instruction_pop_count(circuit_builder),
             XxAdd => ExtProcessorTable::instruction_xxadd(circuit_builder),
+
             XxMul => ExtProcessorTable::instruction_xxmul(circuit_builder),
             XInvert => ExtProcessorTable::instruction_xinv(circuit_builder),
             XbMul => ExtProcessorTable::instruction_xbmul(circuit_builder),
             ReadIo(_) => ExtProcessorTable::instruction_read_io(circuit_builder),
             WriteIo(_) => ExtProcessorTable::instruction_write_io(circuit_builder),
+            XxDotStep => ExtProcessorTable::instruction_xxdotstep(circuit_builder),
+            XbDotStep => ExtProcessorTable::instruction_xbdotstep(circuit_builder),
         }
     }
 
+    /// Constrains instruction argument `nia` such that 0 < nia <= 5.
     fn prohibit_any_illegal_number_of_words(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
     ) -> Vec<ConstraintCircuitMonad<DualRowIndicator>> {
@@ -2779,6 +2963,8 @@ impl ExtProcessorTable {
         challenge(OpStackIndeterminate) - compressed_row
     }
 
+    /// Build constraints for popping `n` elements from the top of the stack and
+    /// writing them to RAM. The reciprocal of [`Self::read_from_ram_any_of`].
     fn write_to_ram_any_of(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
         number_of_words: &[usize],
@@ -2792,6 +2978,11 @@ impl ExtProcessorTable {
         Self::combine_mutually_exclusive_constraint_groups(circuit_builder, all_constraint_groups)
     }
 
+    /// Build constraints for reading `n` elements from RAM and putting them on top
+    /// of the stack. The reciprocal of [`Self::write_to_ram_any_of`].
+    ///
+    /// To constrain RAM reads with more flexible target locations, see
+    /// [`Self::read_from_ram_to`].
     fn read_from_ram_any_of(
         circuit_builder: &ConstraintCircuitBuilder<DualRowIndicator>,
         number_of_words: &[usize],
@@ -3369,6 +3560,7 @@ pub(crate) mod tests {
     use assert2::assert;
     use ndarray::Array2;
     use proptest::collection::vec;
+    use proptest::prop_assert_eq;
     use proptest_arbitrary_interop::arb;
     use rand::thread_rng;
     use rand::Rng;
@@ -4204,4 +4396,64 @@ pub(crate) mod tests {
             &challenges,
         );
     }
+
+    #[proptest]
+    fn xx_product_is_accurate(
+        #[strategy(arb())] a: XFieldElement,
+        #[strategy(arb())] b: XFieldElement,
+    ) {
+        let circuit_builder = ConstraintCircuitBuilder::new();
+        let base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(BaseRow(col.master_base_table_index()))
+        };
+        let [x0, x1, x2, y0, y1, y2] = [ST0, ST1, ST2, ST3, ST4, ST5].map(base_row);
+
+        let mut base_table = Array2::zeros([1, NUM_BASE_COLUMNS]);
+        let ext_table = Array2::zeros([1, NUM_EXT_COLUMNS]);
+        let challenges = Challenges::default().challenges;
+        base_table[[0, ST0.master_base_table_index()]] = a.coefficients[0];
+        base_table[[0, ST1.master_base_table_index()]] = a.coefficients[1];
+        base_table[[0, ST2.master_base_table_index()]] = a.coefficients[2];
+        base_table[[0, ST3.master_base_table_index()]] = b.coefficients[0];
+        base_table[[0, ST4.master_base_table_index()]] = b.coefficients[1];
+        base_table[[0, ST5.master_base_table_index()]] = b.coefficients[2];
+
+        let [c0, c1, c2] = ExtProcessorTable::xx_product([x0, x1, x2], [y0, y1, y2])
+            .map(|c| c.consume())
+            .map(|c| c.evaluate(base_table.view(), ext_table.view(), &challenges));
+
+        let c = a * b;
+        prop_assert_eq!(c.coefficients[0], c0.coefficients[0]);
+        prop_assert_eq!(c.coefficients[1], c1.coefficients[0]);
+        prop_assert_eq!(c.coefficients[2], c2.coefficients[0]);
+    }
+
+    #[proptest]
+    fn xb_product_is_accurate(
+        #[strategy(arb())] a: XFieldElement,
+        #[strategy(arb())] b: BFieldElement,
+    ) {
+        let circuit_builder = ConstraintCircuitBuilder::new();
+        let base_row = |col: ProcessorBaseTableColumn| {
+            circuit_builder.input(BaseRow(col.master_base_table_index()))
+        };
+        let [x0, x1, x2, y] = [ST0, ST1, ST2, ST3].map(base_row);
+
+        let mut base_table = Array2::zeros([1, NUM_BASE_COLUMNS]);
+        let ext_table = Array2::zeros([1, NUM_EXT_COLUMNS]);
+        let challenges = Challenges::default().challenges;
+        base_table[[0, ST0.master_base_table_index()]] = a.coefficients[0];
+        base_table[[0, ST1.master_base_table_index()]] = a.coefficients[1];
+        base_table[[0, ST2.master_base_table_index()]] = a.coefficients[2];
+        base_table[[0, ST3.master_base_table_index()]] = b;
+
+        let [c0, c1, c2] = ExtProcessorTable::xb_product([x0, x1, x2], y)
+            .map(|c| c.consume())
+            .map(|c| c.evaluate(base_table.view(), ext_table.view(), &challenges));
+
+        let c = a * b;
+        prop_assert_eq!(c.coefficients[0], c0.coefficients[0]);
+        prop_assert_eq!(c.coefficients[1], c1.coefficients[0]);
+        prop_assert_eq!(c.coefficients[2], c2.coefficients[0]);
+    }
 }
diff --git a/triton-vm/src/table/ram_table.rs b/triton-vm/src/table/ram_table.rs
index 50ed6f76..2a1f2acb 100644
--- a/triton-vm/src/table/ram_table.rs
+++ b/triton-vm/src/table/ram_table.rs
@@ -80,6 +80,7 @@ impl RamTable {
         for (row_index, row) in sorted_rows.enumerate() {
             ram_table.row_mut(row_index).assign(&row);
         }
+
         let all_ram_pointers = ram_table.column(RamPointer.base_table_index());
         let unique_ram_pointers = all_ram_pointers.iter().unique().copied().collect_vec();
         let (bezout_0, bezout_1) =
@@ -180,7 +181,6 @@ impl RamTable {
             let clk_diff = next_row[CLK.base_table_index()] - curr_row[CLK.base_table_index()];
 
             if ramp_diff.is_zero() {
-                assert!(!clk_diff.is_zero(), "row_idx = {row_idx}");
                 clock_jump_differences.push(clk_diff);
             } else {
                 current_bcpc_0 = bezout_coefficient_polynomial_coefficients_0.pop().unwrap();
diff --git a/triton-vm/src/vm.rs b/triton-vm/src/vm.rs
index 28832045..0a49d495 100644
--- a/triton-vm/src/vm.rs
+++ b/triton-vm/src/vm.rs
@@ -11,6 +11,7 @@ use ndarray::Array1;
 use num_traits::One;
 use num_traits::Zero;
 use serde_derive::*;
+use twenty_first::math::x_field_element::EXTENSION_DEGREE;
 use twenty_first::prelude::*;
 use twenty_first::util_types::algebraic_hasher::Domain;
 
@@ -135,6 +136,7 @@ impl VMState {
             return hvs;
         };
 
+        let ram_read = |address| self.ram.get(&address).copied().unwrap_or_else(|| bfe!(0));
         match current_instruction {
             Pop(_) | Divine(_) | Dup(_) | Swap(_) | ReadMem(_) | WriteMem(_) | ReadIo(_)
             | WriteIo(_) => {
@@ -166,6 +168,20 @@ impl VMState {
                 }
             }
             Eq => hvs[0] = (self.op_stack[ST1] - self.op_stack[ST0]).inverse_or_zero(),
+            XxDotStep => {
+                hvs[0] = ram_read(self.op_stack[ST0]);
+                hvs[1] = ram_read(self.op_stack[ST0] + bfe!(1));
+                hvs[2] = ram_read(self.op_stack[ST0] + bfe!(2));
+                hvs[3] = ram_read(self.op_stack[ST1]);
+                hvs[4] = ram_read(self.op_stack[ST1] + bfe!(1));
+                hvs[5] = ram_read(self.op_stack[ST1] + bfe!(2));
+            }
+            XbDotStep => {
+                hvs[0] = ram_read(self.op_stack[ST0]);
+                hvs[1] = ram_read(self.op_stack[ST1]);
+                hvs[2] = ram_read(self.op_stack[ST1] + bfe!(1));
+                hvs[3] = ram_read(self.op_stack[ST1] + bfe!(2));
+            }
             _ => (),
         }
 
@@ -234,6 +250,8 @@ impl VMState {
             XbMul => self.xb_mul()?,
             WriteIo(n) => self.write_io(n)?,
             ReadIo(n) => self.read_io(n)?,
+            XxDotStep => self.xx_dot_step()?,
+            XbDotStep => self.xb_dot_step()?,
         };
         let op_stack_calls = self.stop_recording_op_stack_calls();
         co_processor_calls.extend(op_stack_calls);
@@ -759,6 +777,47 @@ impl VMState {
         Ok(vec![])
     }
 
+    fn xx_dot_step(&mut self) -> Result<Vec<CoProcessorCall>> {
+        self.start_recording_ram_calls();
+        let mut rhs_address = self.op_stack.pop()?;
+        let mut lhs_address = self.op_stack.pop()?;
+        let mut rhs = xfe!(0);
+        let mut lhs = xfe!(0);
+        for i in 0..EXTENSION_DEGREE {
+            rhs.coefficients[i] = self.ram_read(rhs_address);
+            rhs_address.increment();
+            lhs.coefficients[i] = self.ram_read(lhs_address);
+            lhs_address.increment();
+        }
+        let accumulator = self.op_stack.pop_extension_field_element()? + rhs * lhs;
+        self.op_stack.push_extension_field_element(accumulator);
+        self.op_stack.push(lhs_address);
+        self.op_stack.push(rhs_address);
+        self.instruction_pointer += 1;
+        let ram_calls = self.stop_recording_ram_calls();
+        Ok(ram_calls)
+    }
+
+    fn xb_dot_step(&mut self) -> Result<Vec<CoProcessorCall>> {
+        self.start_recording_ram_calls();
+        let mut rhs_address = self.op_stack.pop()?;
+        let mut lhs_address = self.op_stack.pop()?;
+        let rhs = self.ram_read(rhs_address);
+        rhs_address.increment();
+        let mut lhs = xfe!(0);
+        for i in 0..EXTENSION_DEGREE {
+            lhs.coefficients[i] = self.ram_read(lhs_address);
+            lhs_address.increment();
+        }
+        let accumulator = self.op_stack.pop_extension_field_element()? + rhs * lhs;
+        self.op_stack.push_extension_field_element(accumulator);
+        self.op_stack.push(lhs_address);
+        self.op_stack.push(rhs_address);
+        self.instruction_pointer += 1;
+        let ram_calls = self.stop_recording_ram_calls();
+        Ok(ram_calls)
+    }
+
     pub fn to_processor_row(&self) -> Array1<BFieldElement> {
         use crate::instruction::InstructionBit;
         use ProcessorBaseTableColumn::*;
@@ -1029,6 +1088,8 @@ pub(crate) mod tests {
 
     use assert2::assert;
     use assert2::let_assert;
+    use itertools::izip;
+    use proptest::collection::vec;
     use proptest::prelude::*;
     use proptest_arbitrary_interop::arb;
     use rand::prelude::IteratorRandom;
@@ -1050,6 +1111,7 @@ pub(crate) mod tests {
     use crate::shared_tests::LeavedMerkleTreeTestData;
     use crate::shared_tests::ProgramAndInput;
     use crate::triton_asm;
+    use crate::triton_instr;
     use crate::triton_program;
 
     use super::*;
@@ -1689,6 +1751,153 @@ pub(crate) mod tests {
         source_code_and_input.run().unwrap();
     }
 
+    #[test]
+    fn can_compute_dot_product_from_uninitialized_ram() {
+        let program = triton_program!(xxdotstep xbdotstep halt);
+        program
+            .run(PublicInput::default(), NonDeterminism::default())
+            .unwrap();
+    }
+
+    pub(crate) fn property_based_test_program_for_xxdotstep() -> ProgramAndInput {
+        let mut rng = ThreadRng::default();
+        let n = rng.gen_range(0..10);
+
+        let push_xfe = |xfe: XFieldElement| {
+            let [c_0, c_1, c_2] = xfe.coefficients;
+            triton_asm! { push {c_2} push {c_1} push {c_0} }
+        };
+        let push_and_write_xfe = |xfe| {
+            let push_xfe = push_xfe(xfe);
+            triton_asm! {
+                {&push_xfe}
+                dup 3
+                write_mem 3
+                swap 1
+                pop 1
+            }
+        };
+        let into_write_instructions = |elements: Vec<_>| {
+            elements
+                .into_iter()
+                .flat_map(push_and_write_xfe)
+                .collect_vec()
+        };
+
+        let vector_one = (0..n).map(|_| rng.gen()).collect_vec();
+        let vector_two = (0..n).map(|_| rng.gen()).collect_vec();
+        let inner_product = vector_one
+            .iter()
+            .zip(&vector_two)
+            .map(|(&a, &b)| a * b)
+            .sum();
+        let push_inner_product = push_xfe(inner_product);
+
+        let push_and_write_vector_one = into_write_instructions(vector_one);
+        let push_and_write_vector_two = into_write_instructions(vector_two);
+        let many_dotsteps = (0..n).map(|_| triton_instr!(xxdotstep)).collect_vec();
+
+        let code = triton_program! {
+            push 0
+            {&push_and_write_vector_one}
+            dup 0
+            {&push_and_write_vector_two}
+            pop 1
+            push 0
+
+            {&many_dotsteps}
+            pop 2
+            push 0 push 0
+
+            {&push_inner_product}
+            push 0 push 0
+            assert_vector
+            halt
+        };
+        ProgramAndInput::new(code)
+    }
+
+    /// Sanity check
+    #[test]
+    fn run_dont_prove_property_based_test_program_for_xxdotstep() {
+        let source_code_and_input = property_based_test_program_for_xxdotstep();
+        source_code_and_input.run().unwrap();
+    }
+
+    pub(crate) fn property_based_test_program_for_xbdotstep() -> ProgramAndInput {
+        let mut rng = ThreadRng::default();
+        let n = rng.gen_range(0..10);
+        let push_xfe = |x: XFieldElement| {
+            triton_asm! {
+                push {x.coefficients[2]}
+                push {x.coefficients[1]}
+                push {x.coefficients[0]}
+            }
+        };
+        let push_and_write_xfe = |x: XFieldElement| {
+            triton_asm! {
+                push {x.coefficients[2]}
+                push {x.coefficients[1]}
+                push {x.coefficients[0]}
+                dup 3
+                write_mem 3
+                swap 1
+                pop 1
+            }
+        };
+        let push_and_write_bfe = |x: BFieldElement| {
+            triton_asm! {
+                push {x}
+                dup 1
+                write_mem 1
+                swap 1
+                pop 1
+            }
+        };
+        let vector_one = (0..n).map(|_| rng.gen::<XFieldElement>()).collect_vec();
+        let vector_two = (0..n).map(|_| rng.gen::<BFieldElement>()).collect_vec();
+        let inner_product = vector_one
+            .iter()
+            .zip(vector_two.iter())
+            .map(|(&a, &b)| a * b)
+            .sum::<XFieldElement>();
+        let push_and_write_vector_one = (0..n)
+            .flat_map(|i| push_and_write_xfe(vector_one[i]))
+            .collect_vec();
+        let push_and_write_vector_two = (0..n)
+            .flat_map(|i| push_and_write_bfe(vector_two[i]))
+            .collect_vec();
+        let push_inner_product = push_xfe(inner_product);
+        let many_dotsteps = (0..n).map(|_| triton_instr!(xbdotstep)).collect_vec();
+        let code = triton_program! {
+            push 0
+            {&push_and_write_vector_one}
+            dup 0
+            {&push_and_write_vector_two}
+            pop 1
+            push 0
+            swap 1
+            {&many_dotsteps}
+            pop 1
+            pop 1
+            push 0
+            push 0
+            {&push_inner_product}
+            push 0
+            push 0
+            assert_vector
+            halt
+        };
+        ProgramAndInput::new(code)
+    }
+
+    /// Sanity check
+    #[test]
+    fn run_dont_prove_property_based_test_program_for_xbdotstep() {
+        let source_code_and_input = property_based_test_program_for_xbdotstep();
+        source_code_and_input.run().unwrap();
+    }
+
     #[proptest]
     fn negative_property_is_u32(
         #[strategy(arb())]
@@ -2284,4 +2493,123 @@ pub(crate) mod tests {
         let deserialized = serde_json::from_str(&serialized).unwrap();
         prop_assert_eq!(vm_state, deserialized);
     }
+
+    #[proptest]
+    fn xxdotstep(
+        #[strategy(0_usize..=25)] n: usize,
+        #[strategy(vec(arb(), #n))] lhs: Vec<XFieldElement>,
+        #[strategy(vec(arb(), #n))] rhs: Vec<XFieldElement>,
+        #[strategy(arb())] lhs_address: BFieldElement,
+        #[strategy(arb())] rhs_address: BFieldElement,
+    ) {
+        let mem_region_size = (n * EXTENSION_DEGREE) as u64;
+        let mem_region = |addr: BFieldElement| addr.value()..addr.value() + mem_region_size;
+        let right_in_left = mem_region(lhs_address).contains(&rhs_address.value());
+        let left_in_right = mem_region(rhs_address).contains(&lhs_address.value());
+        if right_in_left || left_in_right {
+            let reason = "storing into overlapping regions would overwrite";
+            return Err(TestCaseError::Reject(reason.into()));
+        }
+
+        let lhs_flat = lhs.iter().flat_map(|&xfe| xfe.coefficients).collect_vec();
+        let rhs_flat = rhs.iter().flat_map(|&xfe| xfe.coefficients).collect_vec();
+        let mut ram = HashMap::new();
+        for (i, &l, &r) in izip!(0.., &lhs_flat, &rhs_flat) {
+            ram.insert(lhs_address + bfe!(i), l);
+            ram.insert(rhs_address + bfe!(i), r);
+        }
+
+        let public_input = PublicInput::default();
+        let secret_input = NonDeterminism::default().with_ram(ram);
+        let many_xxdotsteps = triton_asm![xxdotstep; n];
+        let program = triton_program! {
+            push 0 push 0 push 0
+
+            push {lhs_address}
+            push {rhs_address}
+
+            {&many_xxdotsteps}
+            halt
+        };
+
+        let mut vmstate = VMState::new(&program, public_input, secret_input);
+        prop_assert!(vmstate.run().is_ok());
+
+        prop_assert_eq!(
+            vmstate.op_stack.pop().unwrap(),
+            rhs_address + bfe!(rhs_flat.len() as u64)
+        );
+        prop_assert_eq!(
+            vmstate.op_stack.pop().unwrap(),
+            lhs_address + bfe!(lhs_flat.len() as u64)
+        );
+
+        let observed_dot_product = vmstate.op_stack.pop_extension_field_element().unwrap();
+        let expected_dot_product = lhs
+            .into_iter()
+            .zip(rhs)
+            .map(|(l, r)| l * r)
+            .sum::<XFieldElement>();
+        prop_assert_eq!(expected_dot_product, observed_dot_product);
+    }
+
+    #[proptest]
+    fn xbdotstep(
+        #[strategy(0_usize..=25)] n: usize,
+        #[strategy(vec(arb(), #n))] lhs: Vec<XFieldElement>,
+        #[strategy(vec(arb(), #n))] rhs: Vec<BFieldElement>,
+        #[strategy(arb())] lhs_address: BFieldElement,
+        #[strategy(arb())] rhs_address: BFieldElement,
+    ) {
+        let mem_region_size_lhs = (n * EXTENSION_DEGREE) as u64;
+        let mem_region_lhs = lhs_address.value()..lhs_address.value() + mem_region_size_lhs;
+        let mem_region_rhs = rhs_address.value()..rhs_address.value() + n as u64;
+        let right_in_left = mem_region_lhs.contains(&rhs_address.value());
+        let left_in_right = mem_region_rhs.contains(&lhs_address.value());
+        if right_in_left || left_in_right {
+            let reason = "storing into overlapping regions would overwrite";
+            return Err(TestCaseError::Reject(reason.into()));
+        }
+
+        let lhs_flat = lhs.iter().flat_map(|&xfe| xfe.coefficients).collect_vec();
+        let mut ram = HashMap::new();
+        for (i, &l) in (0..).zip(&lhs_flat) {
+            ram.insert(lhs_address + bfe!(i), l);
+        }
+        for (i, &r) in (0..).zip(&rhs) {
+            ram.insert(rhs_address + bfe!(i), r);
+        }
+
+        let public_input = PublicInput::default();
+        let secret_input = NonDeterminism::default().with_ram(ram);
+        let many_xbdotsteps = triton_asm![xbdotstep; n];
+        let program = triton_program! {
+            push 0 push 0 push 0
+
+            push {lhs_address}
+            push {rhs_address}
+
+            {&many_xbdotsteps}
+            halt
+        };
+
+        let mut vmstate = VMState::new(&program, public_input, secret_input);
+        prop_assert!(vmstate.run().is_ok());
+
+        prop_assert_eq!(
+            vmstate.op_stack.pop().unwrap(),
+            rhs_address + bfe!(rhs.len() as u64)
+        );
+        prop_assert_eq!(
+            vmstate.op_stack.pop().unwrap(),
+            lhs_address + bfe!(lhs_flat.len() as u64)
+        );
+        let observed_dot_product = vmstate.op_stack.pop_extension_field_element().unwrap();
+        let expected_dot_product = lhs
+            .into_iter()
+            .zip(rhs)
+            .map(|(l, r)| l * r)
+            .sum::<XFieldElement>();
+        prop_assert_eq!(expected_dot_product, observed_dot_product);
+    }
 }