From 19da2b0b09de5033c835daf07d2f9e71d1d3f954 Mon Sep 17 00:00:00 2001
From: Yen-Fu Chen
Date: Mon, 19 Dec 2022 21:23:54 +0800
Subject: [PATCH] Reduce instruction dispatch overhead by tail-call elimination

We adopt the approach of wasm3, which separates the emulation of each
instruction into its own function and organizes those functions into a
function table. Performance analysis then showed that the emulator
spent a significant amount of time computing offsets into that function
table. We therefore alter struct rv_insn_t so that the decoder can
assign each instruction's emulation function directly to its IR through
the new function-pointer member 'impl'. Running the CoreMark benchmark
now produces faster results than before; the results are shown below.

| Microprocessor | Compiler | CoreMark w/ commit f2da162 | CoreMark w/ PR sysprog21#95 | Speedup |
|------------------------------------------------------------------------------------------------|
| Core i7-8700   | clang-15 | 836.4849530                | 971.9516670                 | +13.9%  |
|------------------------------------------------------------------------------------------------|
| Core i7-8700   | gcc-12   | 888.3423808                | 963.3369450                 | +7.8%   |
|------------------------------------------------------------------------------------------------|
| eMag 8180      | clang-15 | 286.0007652                | 335.396515                  | +20.5%  |
|------------------------------------------------------------------------------------------------|
| eMag 8180      | gcc-12   | 259.6389222                | 332.561175                  | +14.0%  |

Previously, we had to compute each jump target through a switch-case
statement, computed goto, or a function table; this is no longer
necessary.
---
 Makefile      |    4 +
 src/common.h  |   10 +
 src/decode.h  |  270 +++-
 src/emulate.c | 1921 ++++++++++++++++++++++++-------------------
 4 files changed, 1083 insertions(+), 1122 deletions(-)

diff --git a/Makefile b/Makefile
index 4d87da712..259396871 100644
--- a/Makefile
+++ b/Makefile
@@ -88,6 +88,10 @@ gdbstub-test: $(BIN)
 	$(Q)tests/gdbstub.sh && $(call notice, [OK])
 endif
 
+# For tail-call elimination, we need a specific set of build flags applied.
+# FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact.
+$(OUT)/emulate.o: CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector
+
 # Clear the .DEFAULT_GOAL special variable, so that the following turns
 # to the first target after .DEFAULT_GOAL is not set.
 .DEFAULT_GOAL :=
diff --git a/src/common.h b/src/common.h
index 348764125..e59816f3d 100644
--- a/src/common.h
+++ b/src/common.h
@@ -24,6 +24,16 @@
 #define __ALIGNED(x)
 #endif
 
+/* Tail-call optimization (TCO) is not performed in non-optimized builds. To
+ * work around this, we attempt to use the compiler attribute 'musttail',
+ * which forces the compiler to apply TCO even when optimizations are off.
+ */
+#if defined(__has_attribute) && __has_attribute(musttail)
+#define MUST_TAIL __attribute__((musttail))
+#else
+#define MUST_TAIL
+#endif
+
 /* Pattern Matching for C macros.
* https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms */ diff --git a/src/decode.h b/src/decode.h index 084f41309..350426727 100644 --- a/src/decode.h +++ b/src/decode.h @@ -8,158 +8,160 @@ #include #include +#include "riscv.h" + /* RISC-V instruction list */ /* clang-format off */ #define RISCV_INSN_LIST \ - _(nop) \ + _(nop, 0) \ /* RV32I Base Instruction Set */ \ - _(lui) \ - _(auipc) \ - _(jal) \ - _(jalr) \ - _(beq) \ - _(bne) \ - _(blt) \ - _(bge) \ - _(bltu) \ - _(bgeu) \ - _(lb) \ - _(lh) \ - _(lw) \ - _(lbu) \ - _(lhu) \ - _(sb) \ - _(sh) \ - _(sw) \ - _(addi) \ - _(slti) \ - _(sltiu) \ - _(xori) \ - _(ori) \ - _(andi) \ - _(slli) \ - _(srli) \ - _(srai) \ - _(add) \ - _(sub) \ - _(sll) \ - _(slt) \ - _(sltu) \ - _(xor) \ - _(srl) \ - _(sra) \ - _(or) \ - _(and) \ - _(ecall) \ - _(ebreak) \ + _(lui, 0) \ + _(auipc, 0) \ + _(jal, 1) \ + _(jalr, 1) \ + _(beq, 1) \ + _(bne, 1) \ + _(blt, 1) \ + _(bge, 1) \ + _(bltu, 1) \ + _(bgeu, 1) \ + _(lb, 0) \ + _(lh, 0) \ + _(lw, 0) \ + _(lbu, 0) \ + _(lhu, 0) \ + _(sb, 0) \ + _(sh, 0) \ + _(sw, 0) \ + _(addi, 0) \ + _(slti, 0) \ + _(sltiu, 0) \ + _(xori, 0) \ + _(ori, 0) \ + _(andi, 0) \ + _(slli, 0) \ + _(srli, 0) \ + _(srai, 0) \ + _(add, 0) \ + _(sub, 0) \ + _(sll, 0) \ + _(slt, 0) \ + _(sltu, 0) \ + _(xor, 0) \ + _(srl, 0) \ + _(sra, 0) \ + _(or, 0) \ + _(and, 0) \ + _(ecall, 1) \ + _(ebreak, 1) \ /* RISC-V Privileged Instruction */ \ - _(wfi) \ - _(uret) \ - _(sret) \ - _(hret) \ - _(mret) \ + _(wfi, 0) \ + _(uret, 0) \ + _(sret, 0) \ + _(hret, 0) \ + _(mret, 1) \ /* RV32 Zifencei Standard Extension */ \ IIF(RV32_HAS(Zifencei))( \ - _(fencei) \ + _(fencei, 0) \ ) \ /* RV32 Zicsr Standard Extension */ \ IIF(RV32_HAS(Zicsr))( \ - _(csrrw) \ - _(csrrs) \ - _(csrrc) \ - _(csrrwi) \ - _(csrrsi) \ - _(csrrci) \ + _(csrrw, 0) \ + _(csrrs, 0) \ + _(csrrc, 0) \ + _(csrrwi, 0) \ + _(csrrsi, 0) \ + _(csrrci, 0) \ ) \ /* RV32M Standard Extension */ \ IIF(RV32_HAS(EXT_M))( \ - _(mul) \ - _(mulh) \ - _(mulhsu) \ - _(mulhu) \ - _(div) \ - _(divu) \ - _(rem) \ - _(remu) \ + _(mul, 0) \ + _(mulh, 0) \ + _(mulhsu, 0) \ + _(mulhu, 0) \ + _(div, 0) \ + _(divu, 0) \ + _(rem, 0) \ + _(remu, 0) \ ) \ /* RV32A Standard Extension */ \ IIF(RV32_HAS(EXT_A))( \ - _(lrw) \ - _(scw) \ - _(amoswapw) \ - _(amoaddw) \ - _(amoxorw) \ - _(amoandw) \ - _(amoorw) \ - _(amominw) \ - _(amomaxw) \ - _(amominuw) \ - _(amomaxuw) \ + _(lrw, 0) \ + _(scw, 0) \ + _(amoswapw, 0) \ + _(amoaddw, 0) \ + _(amoxorw, 0) \ + _(amoandw, 0) \ + _(amoorw, 0) \ + _(amominw, 0) \ + _(amomaxw, 0) \ + _(amominuw, 0) \ + _(amomaxuw, 0) \ ) \ /* RV32F Standard Extension */ \ IIF(RV32_HAS(EXT_F))( \ - _(flw) \ - _(fsw) \ - _(fmadds) \ - _(fmsubs) \ - _(fnmsubs) \ - _(fnmadds) \ - _(fadds) \ - _(fsubs) \ - _(fmuls) \ - _(fdivs) \ - _(fsqrts) \ - _(fsgnjs) \ - _(fsgnjns) \ - _(fsgnjxs) \ - _(fmins) \ - _(fmaxs) \ - _(fcvtws) \ - _(fcvtwus) \ - _(fmvxw) \ - _(feqs) \ - _(flts) \ - _(fles) \ - _(fclasss) \ - _(fcvtsw) \ - _(fcvtswu) \ - _(fmvwx) \ + _(flw, 0) \ + _(fsw, 0) \ + _(fmadds, 0) \ + _(fmsubs, 0) \ + _(fnmsubs, 0) \ + _(fnmadds, 0) \ + _(fadds, 0) \ + _(fsubs, 0) \ + _(fmuls, 0) \ + _(fdivs, 0) \ + _(fsqrts, 0) \ + _(fsgnjs, 0) \ + _(fsgnjns, 0) \ + _(fsgnjxs, 0) \ + _(fmins, 0) \ + _(fmaxs, 0) \ + _(fcvtws, 0) \ + _(fcvtwus, 0) \ + _(fmvxw, 0) \ + _(feqs, 0) \ + _(flts, 0) \ + _(fles, 0) \ + _(fclasss, 0) \ + _(fcvtsw, 0) \ + _(fcvtswu, 0) \ + _(fmvwx, 0) \ ) \ /* RV32C Standard Extension */ \ IIF(RV32_HAS(EXT_C))( \ - _(caddi4spn) \ - _(clw) \ - _(csw) \ - _(cnop) \ 
- _(caddi) \ - _(cjal) \ - _(cli) \ - _(caddi16sp) \ - _(clui) \ - _(csrli) \ - _(csrai) \ - _(candi) \ - _(csub) \ - _(cxor) \ - _(cor) \ - _(cand) \ - _(cj) \ - _(cbeqz) \ - _(cbnez) \ - _(cslli) \ - _(clwsp) \ - _(cjr) \ - _(cmv) \ - _(cebreak) \ - _(cjalr) \ - _(cadd) \ - _(cswsp) \ + _(caddi4spn, 0) \ + _(clw, 0) \ + _(csw, 0) \ + _(cnop, 0) \ + _(caddi, 0) \ + _(cjal, 1) \ + _(cli, 0) \ + _(caddi16sp, 0) \ + _(clui, 0) \ + _(csrli, 0) \ + _(csrai, 0) \ + _(candi, 0) \ + _(csub, 0) \ + _(cxor, 0) \ + _(cor, 0) \ + _(cand, 0) \ + _(cj, 1) \ + _(cbeqz, 1) \ + _(cbnez, 1) \ + _(cslli, 0) \ + _(clwsp, 0) \ + _(cjr, 1) \ + _(cmv, 0) \ + _(cebreak, 1) \ + _(cjalr, 1) \ + _(cadd, 0) \ + _(cswsp, 0) \ ) /* clang-format on */ /* IR list */ enum { -#define _(inst) rv_insn_##inst, +#define _(inst, can_branch) rv_insn_##inst, RISCV_INSN_LIST #undef _ }; @@ -226,7 +228,7 @@ enum { INSN_32 = 4, }; -typedef struct { +typedef struct rv_insn { union { int32_t imm; uint8_t rs3; @@ -241,6 +243,22 @@ typedef struct { /* instruction length */ uint8_t insn_len; + + /* According to tail-call optimization (TCO), if a C function ends with + * a function call to another function or itself and simply returns that + * function's result, the compiler can substitute a simple jump to the + * other function for the 'call' and 'return' instructions . The self + * -recursive function can therefore use the same function stack frame. + * + * Using member tailcall, we can tell whether an IR is the final IR in + * a basic block. Additionally, member 'impl' allows us to invoke next + * instruction emulation directly without computing the jumping address. + * In order to enable the compiler to perform TCO, we can use these two + * members to rewrite all instruction emulations into a self-recursive + * version. 
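+     * The MUST_TAIL macro defined in src/common.h additionally requests the
+     * transformation via __attribute__((musttail)) on compilers that support
+     * it, so the chain of calls does not grow the stack even in -O0 builds.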
+ */ + bool tailcall; + bool (*impl)(riscv_t *, const struct rv_insn *); } rv_insn_t; /* decode the RISC-V instruction */ diff --git a/src/emulate.c b/src/emulate.c index 8960dbf87..327b6be5a 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -28,6 +28,7 @@ static inline int isnanf(float x) extern struct target_ops gdbstub_ops; #endif +#include "common.h" #include "decode.h" #include "riscv.h" #include "riscv_private.h" @@ -254,100 +255,90 @@ static inline bool insn_is_misaligned(uint32_t pc) ); } -/* execute a basic block */ -static bool emulate(riscv_t *rv, const block_t *block) -{ -#if RV32_HAS(COMPUTED_GOTO) - static const void *dispatch_table[] = { -#define _(inst) [rv_insn_##inst] = &&do_##inst, - RISCV_INSN_LIST -#undef _ - }; - -#define DISPATCH() \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - /* current IR */ \ - ir = block->ir + index++; \ - /* jump */ \ - goto *dispatch_table[ir->opcode]; - -/* clang-format off */ -#define _(inst, code) \ - do_##inst: code \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles CSR */ \ - rv->csr_cycle++; \ - /* all instructions have executed */ \ - if (unlikely(index == n_insn)) \ - return true; \ - DISPATCH() -/* clang-format on */ -#define EPILOGUE() - -#else /* !RV32_HAS(COMPUTED_GOTO) */ -#define DISPATCH() \ - for (uint32_t i = 0; i < n_insn; i++) { \ - ir = block->ir + i; \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - switch (ir->opcode) { -/* clang-format off */ -#define _(inst, code) \ - case rv_insn_##inst: code \ - break; -#define EPILOGUE() \ - } \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles csr */ \ - rv->csr_cycle++; \ - } \ - return true; -/* clang-format on */ -#endif /* RV32_HAS(COMPUTED_GOTO) */ - - const uint32_t n_insn = block->n_insn; - rv_insn_t *ir; +#define RVOP(inst, code) \ + static bool do_##inst(riscv_t *rv UNUSED, const rv_insn_t *ir UNUSED) \ + { \ + rv->X[rv_reg_zero] = 0; \ + code rv->PC += ir->insn_len; \ + rv->csr_cycle++; \ + if (ir->tailcall) \ + return true; \ + const rv_insn_t *next = ir + 1; \ + MUST_TAIL return next->impl(rv, next); \ + } -#if RV32_HAS(COMPUTED_GOTO) - /* current index in block */ - uint32_t index = 0; -#endif +/* RV32I Base Instruction Set */ - /* main loop */ - DISPATCH() +/* Internal */ +RVOP(nop, {/* no operation */}); - /* Internal */ - _(nop, /* no operation */) +/* LUI is used to build 32-bit constants and uses the U-type format. LUI + * places the U-immediate value in the top 20 bits of the destination + * register rd, filling in the lowest 12 bits with zeros. The 32-bit + * result is sign-extended to 64 bits. + */ +RVOP(lui, { rv->X[ir->rd] = ir->imm; }) - /* LUI (Load Upper Immediate) is used to build 32-bit constants and uses the - * U-type format. LUI places the U-immediate value in the top 20 bits of the - * destination register rd, filling in the lowest 12 bits with zeros. The - * 32-bit result is sign-extended to 64 bits. - */ - _(lui, rv->X[ir->rd] = ir->imm;) +/* AUIPC is used to build pc-relative addresses and uses the U-type + * format. AUIPC forms a 32-bit offset from the 20-bit U-immediate, + * filling in the lowest 12 bits with zeros, adds this offset to the + * address of the AUIPC instruction, then places the result in register + * rd. + */ +RVOP(auipc, { rv->X[ir->rd] = ir->imm + rv->PC; }) - /* AUIPC (Add Upper Immediate to PC) is used to build pc-relative addresses - * and uses the U-type format. 
AUIPC forms a 32-bit offset from the 20-bit - * U-immediate, filling in the lowest 12 bits with zeros, adds this offset - * to the address of the AUIPC instruction, then places the result in - * register rd. - */ - _(auipc, rv->X[ir->rd] = ir->imm + rv->PC;) +/* JAL: Jump and Link + * store successor instruction address into rd. + * add next J imm (offset) to pc. + */ +RVOP(jal, { + const uint32_t pc = rv->PC; + /* Jump */ + rv->PC += ir->imm; + /* link with return address */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } + /* can branch */ + rv->csr_cycle++; + return true; +}) + +/*The indirect jump instruction JALR uses the I-type encoding. The + * target address is obtained by adding the sign-extended 12-bit + * I-immediate to the register rs1, then setting the least-significant + * bit of the result to zero. The address of the instruction following + * the jump (pc+4) is written to register rd. Register x0 can be used as + * the destination if the result is not required. + */ +RVOP(jalr, { + const uint32_t pc = rv->PC; + /* jump */ + rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; + /* link */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } + /* can branch */ + rv->csr_cycle++; + return true; +}) - /* JAL: Jump and Link - * store successor instruction address into rd. - * add next J imm (offset) to pc. - */ - _(jal, { - const uint32_t pc = rv->PC; - /* Jump */ +/* BEQ: Branch if Equal */ +RVOP(beq, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] == rv->X[ir->rs2]) { rv->PC += ir->imm; - /* link with return address */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; /* check instruction misaligned */ if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; @@ -357,24 +348,16 @@ static bool emulate(riscv_t *rv, const block_t *block) /* increment the cycles csr */ rv->csr_cycle++; /* can branch */ + rv->csr_cycle++; return true; - }) - - /* JALR: Jump and Link Register - * The indirect jump instruction JALR uses the I-type encoding. The - * target address is obtained by adding the sign-extended 12-bit - * I-immediate to the register rs1, then setting the least-significant - * bit of the result to zero. The address of the instruction following - * the jump (pc+4) is written to register rd. Register x0 can be used as - * the destination if the result is not required. 
- */ - _(jalr, { - const uint32_t pc = rv->PC; - /* jump */ - rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; - /* link */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; + } +}) + +/* BNE: Branch if Not Equal */ +RVOP(bne, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] != rv->X[ir->rs2]) { + rv->PC += ir->imm; /* check instruction misaligned */ if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; @@ -384,987 +367,931 @@ static bool emulate(riscv_t *rv, const block_t *block) /* increment the cycles csr */ rv->csr_cycle++; /* can branch */ + rv->csr_cycle++; return true; - }) - - /* BEQ: Branch if Equal */ - _(beq, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] == rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BNE: Branch if Not Equal */ - _(bne, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] != rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLT: Branch if Less Than */ - _(blt, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGE: Branch if Greater Than */ - _(bge, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLTU: Branch if Less Than Unsigned */ - _(bltu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] < rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGEU: Branch if Greater Than Unsigned */ - _(bgeu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] >= rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* LB: Load Byte */ - _(lb, { - rv->X[ir->rd] = - sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); - }) - - /* LH: Load Halfword */ - _(lh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); - }) + } +}) - /* LW: Load Word */ - _(lw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if 
(unlikely(addr & 3)) { +/* BLT: Branch if Less Than */ +RVOP(blt, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) { + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; - rv_except_load_misaligned(rv, addr); + rv_except_insn_misaligned(rv, pc); return false; } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) - - /* LBU: Load Byte Unsigned */ - _(lbu, rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm);) + /* can branch */ + rv->csr_cycle++; + return true; + } +}) - /* LHU: Load Halfword Unsigned */ - _(lhu, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { +/* BGE: Branch if Greater Than */ +RVOP(bge, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) { + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; - rv_except_load_misaligned(rv, addr); + rv_except_insn_misaligned(rv, pc); return false; } - rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); - }) - - /* SB: Store Byte */ - _(sb, rv->io.mem_write_b(rv, rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]);) + /* can branch */ + rv->csr_cycle++; + return true; + } +}) - /* SH: Store Halfword */ - _(sh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { +/* BLTU: Branch if Less Than Unsigned */ +RVOP(bltu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] < rv->X[ir->rs2]) { + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; - rv_except_store_misaligned(rv, addr); + rv_except_insn_misaligned(rv, pc); return false; } - rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); - }) + /* can branch */ + rv->csr_cycle++; + return true; + } +}) - /* SW: Store Word */ - _(sw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 3)) { +/* BGEU: Branch if Greater Than Unsigned */ +RVOP(bgeu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] >= rv->X[ir->rs2]) { + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { rv->compressed = false; - rv_except_store_misaligned(rv, addr); + rv_except_insn_misaligned(rv, pc); return false; } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) + /* can branch */ + rv->csr_cycle++; + return true; + } +}) + +/* LB: Load Byte */ +RVOP(lb, { + rv->X[ir->rd] = + sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); +}) + +/* LH: Load Halfword */ +RVOP(lh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); +}) + +/* LW: Load Word */ +RVOP(lw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) + +/* LBU: Load Byte Unsigned */ +RVOP(lbu, { rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm); }) + +/* LHU: Load Halfword Unsigned */ +RVOP(lhu, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); +}) + +/* SB: Store Byte */ +RVOP(sb, { rv->io.mem_write_b(rv, rv->X[ir->rs1] + 
ir->imm, rv->X[ir->rs2]); }) + +/* SH: Store Halfword */ +RVOP(sh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); +}) + +/* SW: Store Word */ +RVOP(sw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) - /* ADDI (Add Immediate) adds the sign-extended 12-bit immediate to register - * rs1. Arithmetic overflow is ignored and the result is simply the low XLEN - * bits of the result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 - * assembler pseudo-instruction. - */ - _(addi, rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm;) +/* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic + * overflow is ignored and the result is simply the low XLEN bits of the + * result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler + * pseudo-instruction. + */ +RVOP(addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }) - /* SLTI (Set on Less Than Immediate) places the value 1 in register rd if - * register rs1 is less than the signextended immediate when both are - * treated as signed numbers, else 0 is written to rd. - */ - _(slti, rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0;) +/* SLTI place the value 1 in register rd if register rs1 is less than the + * signextended immediate when both are treated as signed numbers, else + * 0 is written to rd. + */ +RVOP(slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }) - /* SLTIU (Set on Less Than Immediate Unsigned) places the value 1 in - * register rd if register rs1 is less than the immediate when both are - * treated as unsigned numbers, else 0 is written to rd. - */ - _(sltiu, rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0;) +/* SLTIU places the value 1 in register rd if register rs1 is less than the + * immediate when both are treated as unsigned numbers, else 0 is + * written to rd. + */ +RVOP(sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0; }) - /* XORI: Exclusive OR Immediate */ - _(xori, rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm;) +/* XORI: Exclusive OR Immediate */ +RVOP(xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }) - /* ORI: OR Immediate */ - _(ori, rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm;) +/* ORI: OR Immediate */ +RVOP(ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }) - /* ANDI (AND Immediate) performs bitwise AND on register rs1 and the - * sign-extended 12-bit immediate and place the result in rd. - */ - _(andi, rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm;) +/* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit + * immediate and place the result in rd. + */ +RVOP(andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }) - /* SLLI (Shift Left Logical) performs logical left shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(slli, rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f);) +/* SLLI performs logical left shift on the value in register rs1 by the shift + * amount held in the lower 5 bits of the immediate. 
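+ * Only shamt[4:0] is used, hence the '& 0x1f' mask in the handler below.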
+ */ +RVOP(slli, { rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f); }) - /* SRLI (Shift Right Logical) performs logical right shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(srli, rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f);) +/* SRLI performs logical right shift on the value in register rs1 by the + * shift amount held in the lower 5 bits of the immediate. + */ +RVOP(srli, { rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f); }) - /* SRAI (Shift Right Arithmetic) performs arithmetic right shift on the - * value in register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(srai, rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f);) +/* SRAI performs arithmetic right shift on the value in register rs1 by + * the shift amount held in the lower 5 bits of the immediate. + */ +RVOP(srai, { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f); }) - /* ADD */ - _(add, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]);) +/* ADD */ +RVOP(add, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); +}) - /* SUB: Substract */ - _(sub, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]);) +/* SUB: Substract */ +RVOP(sub, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); +}) - /* SLL: Shift Left Logical */ - _(sll, rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f);) +/* SLL: Shift Left Logical */ +RVOP(sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }) - /* SLT: Set on Less Than */ - _(slt, { - rv->X[ir->rd] = - ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; - }) +/* SLT: Set on Less Than */ +RVOP(slt, { + rv->X[ir->rd] = + ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; +}) - /* SLTU: Set on Less Than Unsigned */ - _(sltu, rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0;) +/* SLTU: Set on Less Than Unsigned */ +RVOP(sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }) - /* XOR: Exclusive OR */ - _(xor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];) +/* XOR: Exclusive OR */ +RVOP(xor, { + rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; +}) - /* SRL: Shift Right Logical */ - _(srl, rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f);) +/* SRL: Shift Right Logical */ +RVOP(srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }) - /* SRA: Shift Right Arithmetic */ - _(sra, { - rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); - }) +/* SRA: Shift Right Arithmetic */ +RVOP(sra, + { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }) - /* OR */ - _(or, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];) +/* OR */ +RVOP(or, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) - /* AND */ - _(and, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];) +/* AND */ +RVOP(and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) - /* ECALL: Environment Call */ - _(ecall, { - rv->io.on_ecall(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* ECALL: Environment Call */ +RVOP(ecall, { + rv->compressed = false; + rv->io.on_ecall(rv); + rv->csr_cycle++; + return true; +}) - /* EBREAK: Environment Break */ - _(ebreak, { - rv->io.on_ebreak(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* EBREAK: Environment Break */ +RVOP(ebreak, { + rv->compressed = false; + rv->io.on_ebreak(rv); + rv->csr_cycle++; + return true; +}) - /* WFI: Wait for Interrupt */ - _(wfi, return false;) +/* WFI: Wait for Interrupt */ +RVOP(wfi, { + /* FIXME: Implement */ + return false; +}) - /* URET: return from traps in U-mode */ - _(uret, return false;) +/* URET: return from traps in U-mode */ +RVOP(uret, { + /* FIXME: Implement */ + return false; +}) - /* SRET: return from traps in S-mode */ - _(sret, return false;) +/* SRET: return from traps in S-mode */ +RVOP(sret, { + /* FIXME: Implement */ + return false; +}) - /* HRET: return from traps in H-mode */ - _(hret, return false;) +/* HRET: return from traps in H-mode */ +RVOP(hret, { + /* FIXME: Implement */ + return false; +}) - /* MRET: return from traps in U-mode */ - _(mret, { - rv->PC = rv->csr_mepc; - /* increment the cycles csr */ - rv->csr_cycle++; - /* this is a branch */ - return true; - }) +/* MRET: return from traps in U-mode */ +RVOP(mret, { + rv->PC = rv->csr_mepc; + /* this is a branch */ + rv->csr_cycle++; + return true; +}) - /* RV32 Zifencei Standard Extension */ -#if RV32_HAS(Zifencei) - _(fencei, /* FIXME: fill real implementations */); +#if RV32_HAS(Zifencei) /* RV32 Zifencei Standard Extension */ +RVOP(fencei, + { + /* FIXME: fill real implementations */ + }) #endif - /* RV32 Zicsr Standard Extension */ -#if RV32_HAS(Zicsr) - /* CSRRW: Atomic Read/Write CSR */ - _(csrrw, { - uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRS: Atomic Read and Set Bits in CSR */ - _(csrrs, { - uint32_t tmp = csr_csrrs( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRC: Atomic Read and Clear Bits in CSR */ - _(csrrc, { - uint32_t tmp = csr_csrrc( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRWI */ - _(csrrwi, { - uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd]; - }) - - /* CSRRSI */ - _(csrrsi, { - uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRCI */ - _(csrrci, { - uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) -#endif /* RV32_HAS(Zicsr) */ - - /* RV32M Standard Extension */ -#if RV32_HAS(EXT_M) - /* MUL: Multiply */ - _(mul, rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2];) - - /* MULH: Multiply High Signed Signed */ - _(mulh, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const int64_t b = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHSU: Multiply High Signed Unsigned */ - _(mulhsu, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const uint64_t b = rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHU: Multiply High Unsigned Unsigned */ - _(mulhu, { - rv->X[ir->rd] = - ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; - }) - - /* DIV: Divide Signed */ - _(div, { - const int32_t dividend = (int32_t) rv->X[ir->rs1]; - const int32_t divisor = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? rv->X[ir->rs1] /* overflow */ - : (unsigned int) (dividend / divisor); - }) - - /* DIVU: Divide Unsigned */ - _(divu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; - }) - - /* REM: Remainder Signed */ - _(rem, { - const int32_t dividend = rv->X[ir->rs1]; - const int32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? 0 /* overflow */ - : (dividend % divisor); - }) - - /* REMU: Remainder Unsigned */ - _(remu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; - }) -#endif /* RV32_HAS(EXT_M) */ - - /* RV32A Standard Extension - * At present, AMO is not implemented atomically because the emulated - * RISC-V core just runs on single thread, and no out-of-order execution - * happens. In addition, rl/aq are not handled. +#if RV32_HAS(Zicsr) /* RV32 Zicsr Standard Extension */ +/* CSRRW: Atomic Read/Write CSR */ +RVOP(csrrw, { + uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRS: Atomic Read and Set Bits in CSR */ +RVOP(csrrs, { + uint32_t tmp = + csr_csrrs(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRC: Atomic Read and Clear Bits in CSR */ +RVOP(csrrc, { + uint32_t tmp = + csr_csrrc(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRWI */ +RVOP(csrrwi, { + uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRSI */ +RVOP(csrrsi, { + uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRCI */ +RVOP(csrrci, { + uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd];
+})
+#endif
+
+#if RV32_HAS(EXT_M) /* RV32M Standard Extension */
+/* MUL: Multiply */
+RVOP(mul,
+     { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; })
+
+/* MULH: Multiply High Signed Signed */
+RVOP(mulh, {
+    const int64_t a = (int32_t) rv->X[ir->rs1];
+    const int64_t b = (int32_t) rv->X[ir->rs2];
+    rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32;
+})
+
+/* MULHSU: Multiply High Signed Unsigned */
+RVOP(mulhsu, {
+    const int64_t a = (int32_t) rv->X[ir->rs1];
+    const uint64_t b = rv->X[ir->rs2];
+    rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32;
+})
+
+/* MULHU: Multiply High Unsigned Unsigned */
+RVOP(mulhu, {
+    rv->X[ir->rd] =
+        ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32;
+})
+
+/* DIV: Divide Signed */
+RVOP(div, {
+    const int32_t dividend = (int32_t) rv->X[ir->rs1];
+    const int32_t divisor = (int32_t) rv->X[ir->rs2];
+    rv->X[ir->rd] = !divisor ? ~0U
+                    : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
+                        ? rv->X[ir->rs1] /* overflow */
+                        : (unsigned int) (dividend / divisor);
+})
+
+/* DIVU: Divide Unsigned */
+RVOP(divu, {
+    const uint32_t dividend = rv->X[ir->rs1];
+    const uint32_t divisor = rv->X[ir->rs2];
+    rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor;
+})
+
+/* REM: Remainder Signed */
+RVOP(rem, {
+    const int32_t dividend = rv->X[ir->rs1];
+    const int32_t divisor = rv->X[ir->rs2];
+    rv->X[ir->rd] = !divisor ? dividend
+                    : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
+                        ? 0 /* overflow */
+                        : (dividend % divisor);
+})
+
+/* REMU: Remainder Unsigned */
+RVOP(remu, {
+    const uint32_t dividend = rv->X[ir->rs1];
+    const uint32_t divisor = rv->X[ir->rs2];
+    rv->X[ir->rd] = !divisor ? dividend : dividend % divisor;
+})
+#endif
+
+#if RV32_HAS(EXT_A) /* RV32A Standard Extension */
+/* At present, AMO is not implemented atomically because the emulated RISC-V
+ * core just runs on single thread, and no out-of-order execution happens.
+ * In addition, rl/aq are not handled.
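+ * The aq (acquire) and rl (release) bits only constrain memory ordering
+ * around the atomic access, so they become relevant once the emulation is
+ * multi-threaded.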
+ */ + +/* LR.W: Load Reserved */ +RVOP(lrw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); + /* skip registration of the 'reservation set' + * FIXME: uimplemented */ -#if RV32_HAS(EXT_A) - /* LR.W: Load Reserved */ - _(lrw, { - /* skip registration of the 'reservation set' - * FIXME: uimplemented - */ - rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); - }) +}) - /* SC.W: Store Conditional */ - _(scw, { - /* assume the 'reservation set' is valid - * FIXME: unimplemented - */ - rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); - rv->X[ir->rd] = 0; - }) - - /* AMOSWAP.W: Atomic Swap */ - _(amoswapw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); - }) - - /* AMOADD.W: Atomic ADD */ - _(amoaddw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOXOR.W: Atomix XOR */ - _(amoxorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOAND.W: Atomic AND */ - _(amoandw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOOR.W: Atomic OR */ - _(amoorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMIN.W: Atomic MIN */ - _(amominw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a < b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAX.W: Atomic MAX */ - _(amomaxw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a > b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMINU.W */ - _(amominuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a < b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAXU.W */ - _(amomaxuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a > b ? 
a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) +/* SC.W: Store Conditional */ +RVOP(scw, { + /* assume the 'reservation set' is valid + * FIXME: unimplemented + */ + rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); + rv->X[ir->rd] = 0; +}) + +/* AMOSWAP.W: Atomic Swap */ +RVOP(amoswapw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); +}) + +/* AMOADD.W: Atomic ADD */ +RVOP(amoaddw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOXOR.W: Atomix XOR */ +RVOP(amoxorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOAND.W: Atomic AND */ +RVOP(amoandw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOOR.W: Atomic OR */ +RVOP(amoorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMIN.W: Atomic MIN */ +RVOP(amominw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAX.W: Atomic MAX */ +RVOP(amomaxw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a > b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMINU.W */ +RVOP(amominuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAXU.W */ +RVOP(amomaxuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a > b ? 
a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) #endif /* RV32_HAS(EXT_A) */ - /* RV32F Standard Extension */ -#if RV32_HAS(EXT_F) - /* FLW */ - _(flw, { - /* copy into the float register */ - const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); - memcpy(rv->F + ir->rd, &data, 4); - }) - - /* FSW */ - _(fsw, { - /* copy from float registers */ - uint32_t data; - memcpy(&data, (const void *) (rv->F + ir->rs2), 4); - rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); - }) - - /* FMADD.S */ - _(fmadds, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3];) - - /* FMSUB.S */ - _(fmsubs, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3];) - - /* FNMSUB.S */ - _(fnmsubs, - rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]);) - - /* FNMADD.S */ - _(fnmadds, - rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3];) - - /* FADD.S */ - _(fadds, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || - isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { - /* raise invalid operation */ - rv->F_int[ir->rd] = RV_NAN; - /* F_int is the integer shortcut of F */ +#if RV32_HAS(EXT_F) /* RV32F Standard Extension */ +/* FLW */ +RVOP(flw, { + /* copy into the float register */ + const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); + memcpy(rv->F + ir->rd, &data, 4); +}) + +/* FSW */ +RVOP(fsw, { + /* copy from float registers */ + uint32_t data; + memcpy(&data, (const void *) (rv->F + ir->rs2), 4); + rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); +}) + +/* FMADD.S */ +RVOP(fmadds, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3]; }) + +/* FMSUB.S */ +RVOP(fmsubs, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3]; }) + +/* FNMSUB.S */ +RVOP(fnmsubs, + { rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]); }) + +/* FNMADD.S */ +RVOP(fnmadds, + { rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3]; }) + +/* FADD.S */ +RVOP(fadds, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || + isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { + /* raise invalid operation */ + rv->F_int[ir->rd] = RV_NAN; /* F_int is the integer shortcut of F */ + rv->csr_fcsr |= FFLAG_INVALID_OP; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; + } + if (isinff(rv->F[ir->rd])) { + rv->csr_fcsr |= FFLAG_OVERFLOW; + rv->csr_fcsr |= FFLAG_INEXACT; + } +}) + +/* FSUB.S */ +RVOP(fsubs, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { + rv->F_int[ir->rd] = RV_NAN; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + } +}) +/* FMUL.S */ +RVOP(fmuls, { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2]; }) + +/* FDIV.S */ +RVOP(fdivs, { rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2]; }) + +/* FSQRT.S */ +RVOP(fsqrts, { rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]); }) + +/* FSGNJ.S */ +RVOP(fsgnjs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJN.S */ +RVOP(fsgnjns, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJX.S */ +RVOP(fsgnjxs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = f1 ^ (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) 
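+
+/* Note: the single-precision helpers here move raw IEEE-754 bit patterns with
+ * memcpy() rather than pointer casts, which keeps the type punning free of
+ * strict-aliasing problems.
+ */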
+ +/* FMIN.S + * In IEEE754-201x, fmin(x, y) return + * - min(x,y) if both numbers are not NaN + * - if one is NaN and another is a number, return the number + * - if both are NaN, return NaN + * When input is signaling NaN, raise invalid operation + */ +RVOP(fmins, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; - } - if (isinff(rv->F[ir->rd])) { - rv->csr_fcsr |= FFLAG_OVERFLOW; - rv->csr_fcsr |= FFLAG_INEXACT; - } - }) - - /* FSUB.S */ - _(fsubs, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { rv->F_int[ir->rd] = RV_NAN; + } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) - - /* FMUL.S */ - _(fmuls, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2];) - - /* FDIV.S */ - _(fdivs, rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2];) - - /* FSQRT.S */ - _(fsqrts, rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]);) - - /* FSGNJ.S */ - _(fsgnjs, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJN.S */ - _(fsgnjns, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJX.S */ - _(fsgnjxs, { - uint32_t f1; - uint32_t f2; - uint32_t res; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - res = f1 ^ (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FMIN.S */ - _(fmins, { - /* In IEEE754-201x, fmin(x, y) return - * - min(x,y) if both numbers are not NaN - * - if one is NaN and another is a number, return the number - * - if both are NaN, return NaN - * When input is signaling NaN, raise invalid operation - */ - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } +}) + +/* FMAX.S */ +RVOP(fmaxs, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) + rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) - ? 
rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F_int[ir->rd] = RV_NAN; } - }) - - /* FMAX.S */ - _(fmaxs, { - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; } else { - uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) - ? rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) ? rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) - - /* FCVT.W.S */ - _(fcvtws, rv->X[ir->rd] = (int32_t) rv->F[ir->rs1];) + } +}) - /* FCVT.WU.S */ - _(fcvtwus, rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1];) +/* FCVT.W.S */ +RVOP(fcvtws, { rv->X[ir->rd] = (int32_t) rv->F[ir->rs1]; }) - /* FMV.X.W */ - _(fmvxw, memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4);) +/* FCVT.WU.S */ +RVOP(fcvtwus, { rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1]; }) - /* FEQ.S performs a quiet comparison: it only sets the invalid - * operation exception flag if either input is a signaling NaN. - */ - _(feqs, rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 1 : 0;) +/* FMV.X.W */ +RVOP(fmvxw, { memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4); }) - /* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers - * to as signaling comparisons: that is, they set the invalid - * operation exception flag if either input is NaN. - */ - _(flts, rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0;) +/* FEQ.S performs a quiet comparison: it only sets the invalid + * operation exception flag if either input is a signaling NaN. + */ +RVOP(feqs, { rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 1 : 0; }) - /* FLE.S */ - _(fles, rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 1 : 0;) +/* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers + * to as signaling comparisons: that is, they set the invalid + * operation exception flag if either input is NaN. + */ +RVOP(flts, { rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0; }) - /* FCLASS.S */ - _(fclasss, { - uint32_t bits; - memcpy(&bits, rv->F + ir->rs1, 4); - rv->X[ir->rd] = calc_fclass(bits); - }) +RVOP(fles, { rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 1 : 0; }) - /* FCVT.S.W */ - _(fcvtsw, rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1];) +/* FCLASS.S */ +RVOP(fclasss, { + uint32_t bits; + memcpy(&bits, rv->F + ir->rs1, 4); + rv->X[ir->rd] = calc_fclass(bits); +}) - /* FCVT.S.WU */ - _(fcvtswu, rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1];) +/* FCVT.S.W */ +RVOP(fcvtsw, { rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1]; }) - /* FMV.W.X */ - _(fmvwx, memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4);) -#endif /* RV32_HAS(EXT_F) */ +/* FCVT.S.WU */ +RVOP(fcvtswu, { rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1]; }) - /* RV32C Standard Extension */ -#if RV32_HAS(EXT_C) - /* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended - * non-zero immediate, scaledby 4, to the stack pointer, x2, and - * writes the result to rd'. 
This instruction is used to generate - * pointers to stack-allocated variables, and expands to addi rd', - * x2, nzuimm[9:2]. - */ - _(caddi4spn, rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm;) +/* FMV.W.X */ +RVOP(fmvwx, { memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4); }) +#endif - /* C.LW loads a 32-bit value from memory into register rd'. It - * computes an ffective address by adding the zero-extended offset, - * scaled by 4, to the base address in register rs1'. It expands to - * # lw rd', offset[6:2](rs1'). - */ - _(clw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) +#if RV32_HAS(EXT_C) /* RV32C Standard Extension */ +/* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended + * non-zero immediate, scaledby 4, to the stack pointer, x2, and writes + * the result to rd'. This instruction is used to generate pointers to + * stack-allocated variables, and expands to addi rd', x2, nzuimm[9:2]. + */ +RVOP(caddi4spn, { rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm; }) - /* C.SW stores a 32-bit value in register rs2' to memory. It computes - * an effective address by adding the zero-extended offset, scaled by - * 4, to the base address in register rs1'. - * It expands to sw rs2', offset[6:2](rs1') - */ - _(csw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) - - /* C.NOP */ - _(cnop, /* nothing */) - - /* C.ADDI adds the non-zero sign-extended 6-bit immediate to the - * value in register rd then writes the result to rd. C.ADDI expands - * into addi rd, rd, nzimm[5:0]. C.ADDI is only valid when rd̸=x0. - * The code point with both rd=x0 and nzimm=0 encodes the C.NOP - * instruction; the remaining code points with either rd=x0 or - * nzimm=0 encode HINTs. - */ - _(caddi, rv->X[ir->rd] += (int16_t) ir->imm;) +/* C.LW loads a 32-bit value from memory into register rd'. It computes + * an ffective address by adding the zero-extended offset, scaled by 4, + * to the base address in register rs1'. It expands to # lw rd', + * offset[6:2](rs1'). + */ +RVOP(clw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) - /* C.JAL */ - _(cjal, { - rv->X[1] = rv->PC + ir->insn_len; - rv->PC += ir->imm; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.SW stores a 32-bit value in register rs2' to memory. It computes an + * effective address by adding the zero-extended offset, scaled by 4, to + * the base address in register rs1'. + * It expands to sw rs2', offset[6:2](rs1') + */ +RVOP(csw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) + +/* C.NOP */ +RVOP(cnop, + { + /* no operation */ + }) + +/* C.ADDI adds the non-zero sign-extended 6-bit immediate to the value + * in register rd then writes the result to rd. C.ADDI expands into addi + * rd, rd, nzimm[5:0]. 
C.ADDI is only valid when rd̸=x0. The code point + * with both rd=x0 and nzimm=0 encodes the C.NOP instruction; the + * remaining code points with either rd=x0 or nzimm=0 encode HINTs. + */ +RVOP(caddi, { rv->X[ir->rd] += (int16_t) ir->imm; }) + +/* C.JAL */ +RVOP(cjal, { + rv->X[1] = rv->PC + ir->insn_len; + rv->PC += ir->imm; + if (unlikely(rv->PC & 0x1)) { + rv->compressed = true; + rv_except_insn_misaligned(rv, rv->PC); + return false; + } + /* can branch */ + rv->csr_cycle++; + return true; +}) - /* C.LI loads the sign-extended 6-bit immediate, imm, into - * register rd. - * C.LI expands into addi rd, x0, imm[5:0]. - * C.LI is only valid when rd=x0; the code points with rd=x0 encode - * HINTs. - */ - _(cli, rv->X[ir->rd] = ir->imm;) +/* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. + * C.LI expands into addi rd, x0, imm[5:0]. + * C.LI is only valid when rd=x0; the code points with rd=x0 encode + * HINTs. + */ +RVOP(cli, { rv->X[ir->rd] = ir->imm; }) - /* C.ADDI16SP is used to adjust the stack pointer in procedure - * prologues and epilogues. - * It expands into addi x2, x2, nzimm[9:4]. - * C.ADDI16SP is only valid when nzimm̸=0; the code point with - * nzimm=0 is reserved. - */ - _(caddi16sp, rv->X[ir->rd] += ir->imm;) - - /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of - * the destination register, clears the bottom 12 bits, and - * sign-extends bit 17 into all higher bits of the destination. - * C.LUI expands into lui rd, nzimm[17:12]. - * C.LUI is only valid when rd̸={x0, x2}, and when the immediate is - * not equal to zero. - */ - _(clui, rv->X[ir->rd] = ir->imm;) +/* C.ADDI16SP is used to adjust the stack pointer in procedure + * prologues and epilogues. + * It expands into addi x2, x2, nzimm[9:4]. + * C.ADDI16SP is only valid when nzimm̸=0; the code point with nzimm=0 + * is reserved. + */ +RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }) + +/* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the + * destination register, clears the bottom 12 bits, and sign-extends bit + * 17 into all higher bits of the destination. + * C.LUI expands into lui rd, nzimm[17:12]. + * C.LUI is only valid when rd̸={x0, x2}, and when the immediate is not + * equal to zero. + */ +RVOP(clui, { rv->X[ir->rd] = ir->imm; }) - /* C.SRLI is a CB-format instruction that performs a logical right - * shift of the value in register rd' then writes the result to rd'. - * The shift amount is encoded in the shamt field. C.SRLI expands - * into srli rd', rd', shamt[5:0]. - */ - _(csrli, rv->X[ir->rs1] >>= ir->shamt;) +/* C.SRLI is a CB-format instruction that performs a logical right shift + * of the value in register rd' then writes the result to rd'. The shift + * amount is encoded in the shamt field. C.SRLI expands into srli rd', + * rd', shamt[5:0]. + */ +RVOP(csrli, { rv->X[ir->rs1] >>= ir->shamt; }) - /* C.SRAI is defined analogously to C.SRLI, but instead performs an - * arithmetic right shift. - * C.SRAI expands to srai rd', rd', shamt[5:0]. - */ - _(csrai, { - const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; - rv->X[ir->rs1] >>= ir->shamt; - for (unsigned int i = 0; i < ir->shamt; ++i) - rv->X[ir->rs1] |= mask >> i; - }) - - /* C.ANDI is a CB-format instruction that computes the bitwise AND of - * the value in register rd' and the sign-extended 6-bit immediate, - * then writes the result to rd'. - * C.ANDI expands to andi rd', rd', imm[5:0]. 
- */
- _(candi, rv->X[ir->rs1] &= ir->imm;)
+/* C.SRAI is defined analogously to C.SRLI, but instead performs an
+ * arithmetic right shift. C.SRAI expands to srai rd', rd', shamt[5:0].
+ */
+RVOP(csrai, {
+ const uint32_t mask = 0x80000000 & rv->X[ir->rs1];
+ rv->X[ir->rs1] >>= ir->shamt;
+ for (unsigned int i = 0; i < ir->shamt; ++i)
+ rv->X[ir->rs1] |= mask >> i;
+})
+
+/* C.ANDI is a CB-format instruction that computes the bitwise AND of
+ * the value in register rd' and the sign-extended 6-bit immediate, then
+ * writes the result to rd'. C.ANDI expands to andi rd', rd', imm[5:0].
+ */
+RVOP(candi, { rv->X[ir->rs1] &= ir->imm; })

- /* C.SUB */
- _(csub, rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2];)
+/* C.SUB */
+RVOP(csub, { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; })

- /* C.XOR */
- _(cxor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];)
+/* C.XOR */
+RVOP(cxor, { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; })

- /* C.OR */
- _(cor, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];)
+/* C.OR */
+RVOP(cor, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; })

- /* C.AND */
- _(cand, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];)
+/* C.AND */
+RVOP(cand, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; })

- /* C.J performs an unconditional control transfer. The offset is
- * sign-extended and added to the pc to form the jump target address.
- * C.J can therefore target a ±2 KiB range.
- * C.J expands to jal x0, offset[11:1].
- */
- _(cj, {
- rv->PC += ir->imm;
- if (rv->PC & 0x1) {
- rv->compressed = true;
- rv_except_insn_misaligned(rv, rv->PC);
- return false;
- }
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- /* C.BEQZ performs conditional control transfers. The offset is
- * sign-extended and added to the pc to form the branch target
- * address. It can therefore target a ±256 B range. C.BEQZ takes the
- * branch if the value in register rs1' is zero.
- * It expands to beq rs1', x0, offset[8:1].
- */
- _(cbeqz, {
- rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- _(cbnez, {
- rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
+/* C.J performs an unconditional control transfer. The offset is
+ * sign-extended and added to the pc to form the jump target address.
+ * C.J can therefore target a ±2 KiB range.
+ * C.J expands to jal x0, offset[11:1].
+ */
+RVOP(cj, {
+ rv->PC += ir->imm;
+ if (unlikely(rv->PC & 0x1)) {
+ rv->compressed = true;
+ rv_except_insn_misaligned(rv, rv->PC);
+ return false;
+ }
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})

- /* C.SLLI is a CI-format instruction that performs a logical left
- * shift of the value in register rd then writes the result to rd.
- * The shift amount is encoded in the shamt field.
- * C.SLLI expands into slli rd, rd, shamt[5:0].
- */
- _(cslli, rv->X[ir->rd] <<= (uint8_t) ir->imm;)
-
- /* C.LWSP */
- _(clwsp, {
- const uint32_t addr = rv->X[rv_reg_sp] + ir->imm;
- if (addr & 3) {
- rv->compressed = true;
- rv_except_load_misaligned(rv, addr);
- return false;
- }
- rv->X[ir->rd] = rv->io.mem_read_w(rv, addr);
- })
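+/* Handlers such as C.J above and C.BEQZ/C.BNEZ below, which are expected to
+ * carry can_branch = 1 in RISCV_INSN_LIST, update PC themselves, bump
+ * csr_cycle and return true instead of tail-calling the next IR, which ends
+ * the chain for the current block; returning false reports a raised
+ * exception such as a misaligned target. rv_step() then looks up or
+ * translates the block at the new PC.
+ */
+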
+/* C.BEQZ performs conditional control transfers. The offset is
+ * sign-extended and added to the pc to form the branch target address.
+ * It can therefore target a ±256 B range. C.BEQZ takes the branch if
+ * the value in register rs1' is zero. It expands to beq rs1', x0,
+ * offset[8:1].
+ */
+RVOP(cbeqz, {
+ rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})
+
+/* C.BNEZ */
+RVOP(cbnez, {
+ rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})

- /* C.JR */
- _(cjr, {
- rv->PC = rv->X[ir->rs1];
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
+/* C.SLLI is a CI-format instruction that performs a logical left shift
+ * of the value in register rd then writes the result to rd. The shift
+ * amount is encoded in the shamt field. C.SLLI expands into slli rd,
+ * rd, shamt[5:0].
+ */
+RVOP(cslli, { rv->X[ir->rd] <<= (uint8_t) ir->imm; })
+
+/* C.LWSP */
+RVOP(clwsp, {
+ const uint32_t addr = rv->X[rv_reg_sp] + ir->imm;
+ if (unlikely(addr & 3)) {
+ rv->compressed = true;
+ rv_except_load_misaligned(rv, addr);
+ return false;
+ }
+ rv->X[ir->rd] = rv->io.mem_read_w(rv, addr);
+})
+
+/* C.JR */
+RVOP(cjr, {
+ rv->PC = rv->X[ir->rs1];
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})

- /* C.MV */
- _(cmv, rv->X[ir->rd] = rv->X[ir->rs2];)
+/* C.MV */
+RVOP(cmv, { rv->X[ir->rd] = rv->X[ir->rs2]; })

- /* C.EBREAK */
- _(cebreak, {
- rv->io.on_ebreak(rv);
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- /* C.JALR */
- _(cjalr, {
- /* Unconditional jump and store PC+2 to ra */
- const int32_t jump_to = rv->X[ir->rs1];
- rv->X[rv_reg_ra] = rv->PC + ir->insn_len;
- rv->PC = jump_to;
- if (rv->PC & 0x1) {
- rv->compressed = true;
- rv_except_insn_misaligned(rv, rv->PC);
- return false;
- }
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- /* C.ADD adds the values in registers rd and rs2 and writes the
- * result to register rd.
- * C.ADD expands into add rd, rd, rs2.
- * C.ADD is only valid when rs2=x0; the code points with rs2=x0
- * correspond to the C.JALR and C.EBREAK instructions. The code
- * points with rs2=x0 and rd=x0 are HINTs.
- */
- _(cadd, rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2];)
-
- /* C.SWSP */
- _(cswsp, {
- const uint32_t addr = rv->X[2] + ir->imm;
- if (addr & 3) {
- rv->compressed = true;
- rv_except_store_misaligned(rv, addr);
- return false;
- }
- rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]);
- })
-#endif /* RV32_HAS(EXT_C) */
+/* C.EBREAK */
+RVOP(cebreak, {
+ rv->compressed = true;
+ rv->io.on_ebreak(rv);
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})
+
+/* C.JALR */
+RVOP(cjalr, {
+ /* Unconditional jump and store PC+2 to ra */
+ const int32_t jump_to = rv->X[ir->rs1];
+ rv->X[rv_reg_ra] = rv->PC + ir->insn_len;
+ rv->PC = jump_to;
+ if (unlikely(rv->PC & 0x1)) {
+ rv->compressed = true;
+ rv_except_insn_misaligned(rv, rv->PC);
+ return false;
+ }
+ /* can branch */
+ rv->csr_cycle++;
+ return true;
+})
+
+/* C.ADD adds the values in registers rd and rs2 and writes the
+ * result to register rd.
+ * C.ADD expands into add rd, rd, rs2.
+ * C.ADD is only valid when rs2̸=x0; the code points with rs2=x0
+ * correspond to the C.JALR and C.EBREAK instructions. The code
+ * points with rs2̸=x0 and rd=x0 are HINTs.
+ */
+RVOP(cadd, { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; })
+
+/* C.SWSP */
+RVOP(cswsp, {
+ const uint32_t addr = rv->X[2] + ir->imm;
+ if (unlikely(addr & 3)) {
+ rv->compressed = true;
+ rv_except_store_misaligned(rv, addr);
+ return false;
+ }
+ rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]);
+})
+#endif /* RV32_HAS(EXT_C) */
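+
+/* dispatch_table maps every rv_insn_* opcode to the handler generated for
+ * it; each _(inst, can_branch) entry of RISCV_INSN_LIST expands to a
+ * designated initializer, e.g.
+ *
+ *     [rv_insn_cadd] = do_cadd,
+ *
+ * block_translate() copies the matching entry into ir->impl, so rv_step()
+ * can run a whole block by invoking the first handler directly and letting
+ * the tail calls do the rest. The can_branch flag is reused by
+ * insn_is_branch() below: IIF(1) keeps the generated case label
+ * (e.g. case rv_insn_jal: return true;), whereas IIF(0) expands to nothing.
+ */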
+static const void *dispatch_table[] = {
+#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
+ RISCV_INSN_LIST
 #undef _
-
- EPILOGUE()
-}
+};

 static bool insn_is_branch(uint8_t opcode)
 {
 switch (opcode) {
- case rv_insn_jal:
- case rv_insn_jalr:
- case rv_insn_beq:
- case rv_insn_bne:
- case rv_insn_blt:
- case rv_insn_bge:
- case rv_insn_bltu:
- case rv_insn_bgeu:
- case rv_insn_ecall:
- case rv_insn_ebreak:
- case rv_insn_mret:
-#if RV32_HAS(EXT_C)
- case rv_insn_cj:
- case rv_insn_cjr:
- case rv_insn_cjal:
- case rv_insn_cjalr:
- case rv_insn_cbeqz:
- case rv_insn_cbnez:
- case rv_insn_cebreak:
-#endif
-#if RV32_HAS(Zifencei)
- case rv_insn_fencei:
-#endif
- return true;
+#define _(inst, can_branch) \
+ IIF(can_branch)(case rv_insn_##inst : return true;, )
+ RISCV_INSN_LIST
+#undef _
 }
 return false;
 }
@@ -1446,7 +1373,7 @@ static void block_translate(riscv_t *rv, block_t *block)
 rv_except_illegal_insn(rv, insn);
 break;
 }
-
+ ir->impl = dispatch_table[ir->opcode];
 /* compute the end of pc */
 block->pc_end += ir->insn_len;
 block->n_insn++;
@@ -1455,6 +1382,7 @@ static void block_translate(riscv_t *rv, block_t *block)
 if (insn_is_branch(ir->opcode))
 break;
 }
+ block->ir[block->n_insn - 1].tailcall = true;
 }

 static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
@@ -1517,7 +1445,8 @@ void rv_step(riscv_t *rv, int32_t cycles)
 assert(block);

 /* execute the block */
- if (!emulate(rv, block))
+ const rv_insn_t *ir = block->ir;
+ if (unlikely(!(ir->impl)(rv, ir)))
 break;
 prev = block;