From 81674be7692e01ca372f7d251a9fafced9e79bb0 Mon Sep 17 00:00:00 2001 From: YenFuChen <48278026+qwe661234@users.noreply.github.com> Date: Wed, 21 Dec 2022 01:29:29 +0800 Subject: [PATCH] Reduce instruction dispatch by tail-call elimination (#95) To meet the tail-call optimization requirement, we must convert the function emulate into a recursive version (TCO). To accomplish this, we add a variable tailcall to the struct rv_insn_t to assist us in determining whether or not the basic block is terminated. As a result, we can rewrite function emulate into a self-recursive function using this variable. However, after performing performance analysis, we discovered that the emulator required a significant amount of time to calculate the jumping address. As a result, we stick with the wasm3 implementation, which separates all instruction emulations, and modify struct rv_insn_t so that we can directly assign instruction emulation to IR by adding member impl. CoreMark results: | Model | Compiler | f2da162 | PR #95 | Speedup | |--------------+----------+---------+---------+---------| | Core i7-8700 | clang-15 | 836.484 | 971.951 | +13.9% | |--------------+----------+---------+---------+---------| | Core i7-8700 | gcc-12 | 888.342 | 963.336 | +7.8% | |--------------+----------+---------+---------+---------| | eMAG 8180 | clang-15 | 286.000 | 335.396 | +20.5% | |--------------+----------+---------+---------+---------| | eMAG 8180 | gcc-12 | 259.638 | 332.561 | +14.0% | Previously, when function "emulate" terminated, it returned to function "block_emulate" because the previous calling sequence was rv_step -> block_emulate -> emulate -> block_emulate -> emulate -> ... As a result, a function stack frame was created each time function "emulate" was invoked. In addition, the jumping address had to be calculated using a method such as switch-case or computed-goto in function "emulate".
Now, however, we can invoke instruction emulation directly, and the current calling route is rv_step -> instruction emulation -> instruction emulation -> ... The instruction emulation can now use the same function stack frame due to TCO. That is, every instruction in a basic block can be emulated by a function that reuses the same function stack frame, saving the overhead of creating function stack frames. --- Makefile | 5 + src/common.h | 10 + src/decode.h | 272 +++---- src/emulate.c | 1948 +++++++++++++++++++++++-------------------------- 4 files changed, 1071 insertions(+), 1164 deletions(-) diff --git a/Makefile b/Makefile index 4d87da712..4a91433ee 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ OUT ?= build BIN := $(OUT)/rv32emu CFLAGS = -std=gnu99 -O2 -Wall -Wextra +CFLAGS += -Wno-unused-label CFLAGS += -include src/common.h # Set the default stack pointer @@ -88,6 +89,10 @@ gdbstub-test: $(BIN) $(Q)tests/gdbstub.sh && $(call notice, [OK]) endif +# For tail-call elimination, we need a specific set of build flags applied. +# FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact. +$(OUT)/emulate.o: CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector + # Clear the .DEFAULT_GOAL special variable, so that the following turns # to the first target after .DEFAULT_GOAL is not set. .DEFAULT_GOAL := diff --git a/src/common.h b/src/common.h index 348764125..e59816f3d 100644 --- a/src/common.h +++ b/src/common.h @@ -24,6 +24,16 @@ #define __ALIGNED(x) #endif +/* There is no tail-call optimization(TCO) in non-optimized builds. To work + * around this, we attempts to use a compiler attribute called musttail that + * forces the compiler to TCO even when optimizations aren't on. + */ +#if defined(__has_attribute) && __has_attribute(musttail) +#define MUST_TAIL __attribute__((musttail)) +#else +#define MUST_TAIL +#endif + /* Pattern Matching for C macros. 
* https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms */ diff --git a/src/decode.h b/src/decode.h index 084f41309..4a9c102fe 100644 --- a/src/decode.h +++ b/src/decode.h @@ -8,158 +8,160 @@ #include #include -/* RISC-V instruction list */ +#include "riscv.h" + +/* RISC-V instruction list in format _(instruction-name, can-branch) */ /* clang-format off */ #define RISCV_INSN_LIST \ - _(nop) \ + _(nop, 0) \ /* RV32I Base Instruction Set */ \ - _(lui) \ - _(auipc) \ - _(jal) \ - _(jalr) \ - _(beq) \ - _(bne) \ - _(blt) \ - _(bge) \ - _(bltu) \ - _(bgeu) \ - _(lb) \ - _(lh) \ - _(lw) \ - _(lbu) \ - _(lhu) \ - _(sb) \ - _(sh) \ - _(sw) \ - _(addi) \ - _(slti) \ - _(sltiu) \ - _(xori) \ - _(ori) \ - _(andi) \ - _(slli) \ - _(srli) \ - _(srai) \ - _(add) \ - _(sub) \ - _(sll) \ - _(slt) \ - _(sltu) \ - _(xor) \ - _(srl) \ - _(sra) \ - _(or) \ - _(and) \ - _(ecall) \ - _(ebreak) \ + _(lui, 0) \ + _(auipc, 0) \ + _(jal, 1) \ + _(jalr, 1) \ + _(beq, 1) \ + _(bne, 1) \ + _(blt, 1) \ + _(bge, 1) \ + _(bltu, 1) \ + _(bgeu, 1) \ + _(lb, 0) \ + _(lh, 0) \ + _(lw, 0) \ + _(lbu, 0) \ + _(lhu, 0) \ + _(sb, 0) \ + _(sh, 0) \ + _(sw, 0) \ + _(addi, 0) \ + _(slti, 0) \ + _(sltiu, 0) \ + _(xori, 0) \ + _(ori, 0) \ + _(andi, 0) \ + _(slli, 0) \ + _(srli, 0) \ + _(srai, 0) \ + _(add, 0) \ + _(sub, 0) \ + _(sll, 0) \ + _(slt, 0) \ + _(sltu, 0) \ + _(xor, 0) \ + _(srl, 0) \ + _(sra, 0) \ + _(or, 0) \ + _(and, 0) \ + _(ecall, 1) \ + _(ebreak, 1) \ /* RISC-V Privileged Instruction */ \ - _(wfi) \ - _(uret) \ - _(sret) \ - _(hret) \ - _(mret) \ + _(wfi, 0) \ + _(uret, 0) \ + _(sret, 0) \ + _(hret, 0) \ + _(mret, 1) \ /* RV32 Zifencei Standard Extension */ \ IIF(RV32_HAS(Zifencei))( \ - _(fencei) \ + _(fencei, 0) \ ) \ /* RV32 Zicsr Standard Extension */ \ IIF(RV32_HAS(Zicsr))( \ - _(csrrw) \ - _(csrrs) \ - _(csrrc) \ - _(csrrwi) \ - _(csrrsi) \ - _(csrrci) \ + _(csrrw, 0) \ + _(csrrs, 0) \ + _(csrrc, 0) \ + _(csrrwi, 0) \ + _(csrrsi, 0) \ + _(csrrci, 0) \ ) \ /* RV32M 
Standard Extension */ \ IIF(RV32_HAS(EXT_M))( \ - _(mul) \ - _(mulh) \ - _(mulhsu) \ - _(mulhu) \ - _(div) \ - _(divu) \ - _(rem) \ - _(remu) \ + _(mul, 0) \ + _(mulh, 0) \ + _(mulhsu, 0) \ + _(mulhu, 0) \ + _(div, 0) \ + _(divu, 0) \ + _(rem, 0) \ + _(remu, 0) \ ) \ /* RV32A Standard Extension */ \ IIF(RV32_HAS(EXT_A))( \ - _(lrw) \ - _(scw) \ - _(amoswapw) \ - _(amoaddw) \ - _(amoxorw) \ - _(amoandw) \ - _(amoorw) \ - _(amominw) \ - _(amomaxw) \ - _(amominuw) \ - _(amomaxuw) \ + _(lrw, 0) \ + _(scw, 0) \ + _(amoswapw, 0) \ + _(amoaddw, 0) \ + _(amoxorw, 0) \ + _(amoandw, 0) \ + _(amoorw, 0) \ + _(amominw, 0) \ + _(amomaxw, 0) \ + _(amominuw, 0) \ + _(amomaxuw, 0) \ ) \ /* RV32F Standard Extension */ \ IIF(RV32_HAS(EXT_F))( \ - _(flw) \ - _(fsw) \ - _(fmadds) \ - _(fmsubs) \ - _(fnmsubs) \ - _(fnmadds) \ - _(fadds) \ - _(fsubs) \ - _(fmuls) \ - _(fdivs) \ - _(fsqrts) \ - _(fsgnjs) \ - _(fsgnjns) \ - _(fsgnjxs) \ - _(fmins) \ - _(fmaxs) \ - _(fcvtws) \ - _(fcvtwus) \ - _(fmvxw) \ - _(feqs) \ - _(flts) \ - _(fles) \ - _(fclasss) \ - _(fcvtsw) \ - _(fcvtswu) \ - _(fmvwx) \ + _(flw, 0) \ + _(fsw, 0) \ + _(fmadds, 0) \ + _(fmsubs, 0) \ + _(fnmsubs, 0) \ + _(fnmadds, 0) \ + _(fadds, 0) \ + _(fsubs, 0) \ + _(fmuls, 0) \ + _(fdivs, 0) \ + _(fsqrts, 0) \ + _(fsgnjs, 0) \ + _(fsgnjns, 0) \ + _(fsgnjxs, 0) \ + _(fmins, 0) \ + _(fmaxs, 0) \ + _(fcvtws, 0) \ + _(fcvtwus, 0) \ + _(fmvxw, 0) \ + _(feqs, 0) \ + _(flts, 0) \ + _(fles, 0) \ + _(fclasss, 0) \ + _(fcvtsw, 0) \ + _(fcvtswu, 0) \ + _(fmvwx, 0) \ ) \ /* RV32C Standard Extension */ \ IIF(RV32_HAS(EXT_C))( \ - _(caddi4spn) \ - _(clw) \ - _(csw) \ - _(cnop) \ - _(caddi) \ - _(cjal) \ - _(cli) \ - _(caddi16sp) \ - _(clui) \ - _(csrli) \ - _(csrai) \ - _(candi) \ - _(csub) \ - _(cxor) \ - _(cor) \ - _(cand) \ - _(cj) \ - _(cbeqz) \ - _(cbnez) \ - _(cslli) \ - _(clwsp) \ - _(cjr) \ - _(cmv) \ - _(cebreak) \ - _(cjalr) \ - _(cadd) \ - _(cswsp) \ + _(caddi4spn, 0) \ + _(clw, 0) \ + _(csw, 0) \ + _(cnop, 0) \ + _(caddi, 0) \ + 
_(cjal, 1) \ + _(cli, 0) \ + _(caddi16sp, 0) \ + _(clui, 0) \ + _(csrli, 0) \ + _(csrai, 0) \ + _(candi, 0) \ + _(csub, 0) \ + _(cxor, 0) \ + _(cor, 0) \ + _(cand, 0) \ + _(cj, 1) \ + _(cbeqz, 1) \ + _(cbnez, 1) \ + _(cslli, 0) \ + _(clwsp, 0) \ + _(cjr, 1) \ + _(cmv, 0) \ + _(cebreak, 1) \ + _(cjalr, 1) \ + _(cadd, 0) \ + _(cswsp, 0) \ ) /* clang-format on */ /* IR list */ enum { -#define _(inst) rv_insn_##inst, +#define _(inst, can_branch) rv_insn_##inst, RISCV_INSN_LIST #undef _ }; @@ -226,7 +228,7 @@ enum { INSN_32 = 4, }; -typedef struct { +typedef struct rv_insn { union { int32_t imm; uint8_t rs3; @@ -241,6 +243,22 @@ typedef struct { /* instruction length */ uint8_t insn_len; + + /* According to tail-call optimization (TCO), if a C function ends with + * a function call to another function or itself and simply returns that + * function's result, the compiler can substitute a simple jump to the + * other function for the 'call' and 'return' instructions . The self + * -recursive function can therefore use the same function stack frame. + * + * Using member tailcall, we can tell whether an IR is the final IR in + * a basic block. Additionally, member 'impl' allows us to invoke next + * instruction emulation directly without computing the jumping address. + * In order to enable the compiler to perform TCO, we can use these two + * members to rewrite all instruction emulations into a self-recursive + * version. 
+ */ + bool tailcall; + bool (*impl)(riscv_t *, const struct rv_insn *); } rv_insn_t; /* decode the RISC-V instruction */ diff --git a/src/emulate.c b/src/emulate.c index 737fbf8e9..352487f98 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -254,1119 +254,991 @@ static inline bool insn_is_misaligned(uint32_t pc) ); } -/* execute a basic block */ -static bool emulate(riscv_t *rv, const block_t *block) -{ -#if RV32_HAS(COMPUTED_GOTO) - static const void *dispatch_table[] = { -#define _(inst) [rv_insn_##inst] = &&do_##inst, - RISCV_INSN_LIST +/* can-branch information for each RISC-V instruction */ +enum { +#define _(inst, can_branch) __rv_insn_##inst##_canbranch = can_branch, + RISCV_INSN_LIST #undef _ - }; - -#define DISPATCH() \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - /* current IR */ \ - ir = block->ir + index++; \ - /* jump */ \ - goto *dispatch_table[ir->opcode]; - -/* clang-format off */ -#define _(inst, code) \ - do_##inst: code \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles CSR */ \ - rv->csr_cycle++; \ - /* all instructions have executed */ \ - if (unlikely(index == n_insn)) \ - return true; \ - DISPATCH() -/* clang-format on */ -#define EPILOGUE() - -#else /* !RV32_HAS(COMPUTED_GOTO) */ -#define DISPATCH() \ - for (uint32_t i = 0; i < n_insn; i++) { \ - ir = block->ir + i; \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - switch (ir->opcode) { -/* clang-format off */ -#define _(inst, code) \ - case rv_insn_##inst: code \ - break; -#define EPILOGUE() \ - } \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles csr */ \ - rv->csr_cycle++; \ - } \ - return true; -/* clang-format on */ -#endif /* RV32_HAS(COMPUTED_GOTO) */ - - const uint32_t n_insn = block->n_insn; - rv_insn_t *ir; - -#if RV32_HAS(COMPUTED_GOTO) - /* current index in block */ - uint32_t index = 0; -#endif - - /* main loop */ - DISPATCH() - - /* Internal */ - _(nop, /* no operation */) 
- - /* LUI (Load Upper Immediate) is used to build 32-bit constants and uses the - * U-type format. LUI places the U-immediate value in the top 20 bits of the - * destination register rd, filling in the lowest 12 bits with zeros. The - * 32-bit result is sign-extended to 64 bits. - */ - _(lui, rv->X[ir->rd] = ir->imm;) - - /* AUIPC (Add Upper Immediate to PC) is used to build pc-relative addresses - * and uses the U-type format. AUIPC forms a 32-bit offset from the 20-bit - * U-immediate, filling in the lowest 12 bits with zeros, adds this offset - * to the address of the AUIPC instruction, then places the result in - * register rd. - */ - _(auipc, rv->X[ir->rd] = ir->imm + rv->PC;) +}; - /* JAL: Jump and Link - * store successor instruction address into rd. - * add next J imm (offset) to pc. - */ - _(jal, { - const uint32_t pc = rv->PC; - /* Jump */ - rv->PC += ir->imm; - /* link with return address */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* JALR: Jump and Link Register - * The indirect jump instruction JALR uses the I-type encoding. The - * target address is obtained by adding the sign-extended 12-bit - * I-immediate to the register rs1, then setting the least-significant - * bit of the result to zero. The address of the instruction following - * the jump (pc+4) is written to register rd. Register x0 can be used as - * the destination if the result is not required. 
- */ - _(jalr, { - const uint32_t pc = rv->PC; - /* jump */ - rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; - /* link */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* BEQ: Branch if Equal */ - _(beq, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] == rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BNE: Branch if Not Equal */ - _(bne, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] != rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLT: Branch if Less Than */ - _(blt, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGE: Branch if Greater Than */ - _(bge, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr 
*/ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLTU: Branch if Less Than Unsigned */ - _(bltu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] < rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGEU: Branch if Greater Than Unsigned */ - _(bgeu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] >= rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* LB: Load Byte */ - _(lb, { - rv->X[ir->rd] = - sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); - }) - - /* LH: Load Halfword */ - _(lh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); - }) - - /* LW: Load Word */ - _(lw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 3)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) - - /* LBU: Load Byte Unsigned */ - _(lbu, rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm);) - - /* LHU: Load Halfword Unsigned */ - _(lhu, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); - }) - - /* SB: Store Byte */ - _(sb, rv->io.mem_write_b(rv, rv->X[ir->rs1] + ir->imm, 
rv->X[ir->rs2]);) - - /* SH: Store Halfword */ - _(sh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); - }) - - /* SW: Store Word */ - _(sw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 3)) { - rv->compressed = false; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) +#define RVOP(inst, code) \ + static bool do_##inst(riscv_t *rv UNUSED, const rv_insn_t *ir UNUSED) \ + { \ + rv->X[rv_reg_zero] = 0; \ + code; \ + if (__rv_insn_##inst##_canbranch) { \ + /* can branch */ \ + rv->csr_cycle++; \ + return true; \ + } \ + nextop: \ + rv->PC += ir->insn_len; \ + if (ir->tailcall) \ + return true; \ + const rv_insn_t *next = ir + 1; \ + MUST_TAIL return next->impl(rv, next); \ + } - /* ADDI (Add Immediate) adds the sign-extended 12-bit immediate to register - * rs1. Arithmetic overflow is ignored and the result is simply the low XLEN - * bits of the result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 - * assembler pseudo-instruction. - */ - _(addi, rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm;) +/* RV32I Base Instruction Set */ - /* SLTI (Set on Less Than Immediate) places the value 1 in register rd if - * register rs1 is less than the signextended immediate when both are - * treated as signed numbers, else 0 is written to rd. - */ - _(slti, rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0;) +/* Internal */ +RVOP(nop, {/* no operation */}); - /* SLTIU (Set on Less Than Immediate Unsigned) places the value 1 in - * register rd if register rs1 is less than the immediate when both are - * treated as unsigned numbers, else 0 is written to rd. - */ - _(sltiu, rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0;) +/* LUI is used to build 32-bit constants and uses the U-type format. 
LUI + * places the U-immediate value in the top 20 bits of the destination + * register rd, filling in the lowest 12 bits with zeros. The 32-bit + * result is sign-extended to 64 bits. + */ +RVOP(lui, { rv->X[ir->rd] = ir->imm; }) - /* XORI: Exclusive OR Immediate */ - _(xori, rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm;) +/* AUIPC is used to build pc-relative addresses and uses the U-type + * format. AUIPC forms a 32-bit offset from the 20-bit U-immediate, + * filling in the lowest 12 bits with zeros, adds this offset to the + * address of the AUIPC instruction, then places the result in register + * rd. + */ +RVOP(auipc, { rv->X[ir->rd] = ir->imm + rv->PC; }) - /* ORI: OR Immediate */ - _(ori, rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm;) +/* JAL: Jump and Link + * store successor instruction address into rd. + * add next J imm (offset) to pc. + */ +RVOP(jal, { + const uint32_t pc = rv->PC; + /* Jump */ + rv->PC += ir->imm; + /* link with return address */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/*The indirect jump instruction JALR uses the I-type encoding. The + * target address is obtained by adding the sign-extended 12-bit + * I-immediate to the register rs1, then setting the least-significant + * bit of the result to zero. The address of the instruction following + * the jump (pc+4) is written to register rd. Register x0 can be used as + * the destination if the result is not required. 
+ */ +RVOP(jalr, { + const uint32_t pc = rv->PC; + /* jump */ + rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; + /* link */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BEQ: Branch if Equal */ +RVOP(beq, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] != rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BNE: Branch if Not Equal */ +RVOP(bne, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] == rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BLT: Branch if Less Than */ +RVOP(blt, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BGE: Branch if Greater Than */ +RVOP(bge, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BLTU: Branch if Less Than Unsigned */ +RVOP(bltu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] >= rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + 
} +}) + +/* BGEU: Branch if Greater Than Unsigned */ +RVOP(bgeu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] < rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* LB: Load Byte */ +RVOP(lb, { + rv->X[ir->rd] = + sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); +}) + +/* LH: Load Halfword */ +RVOP(lh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); +}) - /* ANDI (AND Immediate) performs bitwise AND on register rs1 and the - * sign-extended 12-bit immediate and place the result in rd. - */ - _(andi, rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm;) +/* LW: Load Word */ +RVOP(lw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) - /* SLLI (Shift Left Logical) performs logical left shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(slli, rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f);) +/* LBU: Load Byte Unsigned */ +RVOP(lbu, { rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm); }) - /* SRLI (Shift Right Logical) performs logical right shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. 
- */ - _(srli, rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f);) +/* LHU: Load Halfword Unsigned */ +RVOP(lhu, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); +}) - /* SRAI (Shift Right Arithmetic) performs arithmetic right shift on the - * value in register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(srai, rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f);) +/* SB: Store Byte */ +RVOP(sb, { rv->io.mem_write_b(rv, rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }) - /* ADD */ - _(add, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]);) +/* SH: Store Halfword */ +RVOP(sh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); +}) - /* SUB: Substract */ - _(sub, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]);) +/* SW: Store Word */ +RVOP(sw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) - /* SLL: Shift Left Logical */ - _(sll, rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f);) +/* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic + * overflow is ignored and the result is simply the low XLEN bits of the + * result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler + * pseudo-instruction. + */ +RVOP(addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }) - /* SLT: Set on Less Than */ - _(slt, { - rv->X[ir->rd] = - ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 
1 : 0; - }) +/* SLTI place the value 1 in register rd if register rs1 is less than the + * signextended immediate when both are treated as signed numbers, else + * 0 is written to rd. + */ +RVOP(slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }) - /* SLTU: Set on Less Than Unsigned */ - _(sltu, rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0;) +/* SLTIU places the value 1 in register rd if register rs1 is less than the + * immediate when both are treated as unsigned numbers, else 0 is + * written to rd. + */ +RVOP(sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0; }) - /* XOR: Exclusive OR */ - _(xor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];) +/* XORI: Exclusive OR Immediate */ +RVOP(xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }) - /* SRL: Shift Right Logical */ - _(srl, rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f);) +/* ORI: OR Immediate */ +RVOP(ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }) - /* SRA: Shift Right Arithmetic */ - _(sra, { - rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); - }) +/* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit + * immediate and place the result in rd. + */ +RVOP(andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }) - /* OR */ - _(or, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];) +/* SLLI performs logical left shift on the value in register rs1 by the shift + * amount held in the lower 5 bits of the immediate. + */ +RVOP(slli, { rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f); }) - /* AND */ - _(and, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];) +/* SRLI performs logical right shift on the value in register rs1 by the + * shift amount held in the lower 5 bits of the immediate. 
+ */ +RVOP(srli, { rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f); }) - /* ECALL: Environment Call */ - _(ecall, { - rv->compressed = false; - rv->io.on_ecall(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* SRAI performs arithmetic right shift on the value in register rs1 by + * the shift amount held in the lower 5 bits of the immediate. + */ +RVOP(srai, { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f); }) + +/* ADD */ +RVOP(add, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); +}) + +/* SUB: Substract */ +RVOP(sub, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); +}) + +/* SLL: Shift Left Logical */ +RVOP(sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }) + +/* SLT: Set on Less Than */ +RVOP(slt, { + rv->X[ir->rd] = + ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; +}) + +/* SLTU: Set on Less Than Unsigned */ +RVOP(sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }) + +/* XOR: Exclusive OR */ +RVOP(xor, { + rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; +}) + +/* SRL: Shift Right Logical */ +RVOP(srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }) + +/* SRA: Shift Right Arithmetic */ +RVOP(sra, + { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }) + +/* OR */ +RVOP(or, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) + +/* AND */ +RVOP(and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) + +/* ECALL: Environment Call */ +RVOP(ecall, { + rv->compressed = false; + rv->io.on_ecall(rv); +}) + +/* EBREAK: Environment Break */ +RVOP(ebreak, { + rv->compressed = false; + rv->io.on_ebreak(rv); +}) + +/* WFI: Wait for Interrupt */ +RVOP(wfi, { + /* FIXME: Implement */ + return false; +}) - /* EBREAK: Environment Break */ - _(ebreak, { - rv->compressed = false; - rv->io.on_ebreak(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* URET: return from traps in U-mode */ +RVOP(uret, { + /* FIXME: Implement */ + return false; +}) - /* WFI: Wait for Interrupt */ - _(wfi, return false;) +/* SRET: return from traps in S-mode */ +RVOP(sret, { + /* FIXME: Implement */ + return false; +}) - /* URET: return from traps in U-mode */ - _(uret, return false;) +/* HRET: return from traps in H-mode */ +RVOP(hret, { + /* FIXME: Implement */ + return false; +}) - /* SRET: return from traps in S-mode */ - _(sret, return false;) +/* MRET: return from traps in U-mode */ +RVOP(mret, { rv->PC = rv->csr_mepc; }) - /* HRET: return from traps in H-mode */ - _(hret, return false;) +#if RV32_HAS(Zifencei) /* RV32 Zifencei Standard Extension */ +RVOP(fencei, + { + /* FIXME: fill real implementations */ + }) +#endif - /* MRET: return from traps in U-mode */ - _(mret, { - rv->PC = rv->csr_mepc; - /* increment the cycles csr */ - rv->csr_cycle++; - /* this is a branch */ - return true; - }) +#if RV32_HAS(Zicsr) /* RV32 Zicsr Standard Extension */ +/* CSRRW: Atomic Read/Write CSR 
*/ +RVOP(csrrw, { + uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRS: Atomic Read and Set Bits in CSR */ +RVOP(csrrs, { + uint32_t tmp = + csr_csrrs(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRC: Atomic Read and Clear Bits in CSR */ +RVOP(csrrc, { + uint32_t tmp = + csr_csrrc(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRWI */ +RVOP(csrrwi, { + uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRSI */ +RVOP(csrrsi, { + uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRCI */ +RVOP(csrrci, { + uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) +#endif - /* RV32 Zifencei Standard Extension */ -#if RV32_HAS(Zifencei) - _(fencei, /* FIXME: fill real implementations */); +#if RV32_HAS(EXT_M) /* RV32M Standard Extension */ +/* MUL: Multiply */ +RVOP(mul, + { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }) + +/* MULH: Multiply High Signed Signed */ +RVOP(mulh, { + const int64_t a = (int32_t) rv->X[ir->rs1]; + const int64_t b = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; +}) + +/* MULHSU: Multiply High Signed Unsigned */ +RVOP(mulhsu, { + const int64_t a = (int32_t) rv->X[ir->rs1]; + const uint64_t b = rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; +}) + +/* MULHU: Multiply High Unsigned Unsigned */ +RVOP(mulhu, { + rv->X[ir->rd] = + ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; +}) + +/* DIV: Divide Signed */ +RVOP(div, { + const int32_t dividend = (int32_t) rv->X[ir->rs1]; + const int32_t divisor = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? 
~0U + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? rv->X[ir->rs1] /* overflow */ + : (unsigned int) (dividend / divisor); +}) + +/* DIVU: Divide Unsigned */ +RVOP(divu, { + const uint32_t dividend = rv->X[ir->rs1]; + const uint32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; +}) + +/* REM: Remainder Signed */ +RVOP(rem, { + const int32_t dividend = rv->X[ir->rs1]; + const int32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? dividend + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? 0 /* overflow */ + : (dividend % divisor); +}) + +/* REMU: Remainder Unsigned */ +RVOP(remu, { + const uint32_t dividend = rv->X[ir->rs1]; + const uint32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; +}) #endif - /* RV32 Zicsr Standard Extension */ -#if RV32_HAS(Zicsr) - /* CSRRW: Atomic Read/Write CSR */ - _(csrrw, { - uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRS: Atomic Read and Set Bits in CSR */ - _(csrrs, { - uint32_t tmp = csr_csrrs( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRC: Atomic Read and Clear Bits in CSR */ - _(csrrc, { - uint32_t tmp = csr_csrrc( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRWI */ - _(csrrwi, { - uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRSI */ - _(csrrsi, { - uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRCI */ - _(csrrci, { - uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd]; - }) -#endif /* RV32_HAS(Zicsr) */ - - /* RV32M Standard Extension */ -#if RV32_HAS(EXT_M) - /* MUL: Multiply */ - _(mul, rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2];) - - /* MULH: Multiply High Signed Signed */ - _(mulh, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const int64_t b = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHSU: Multiply High Signed Unsigned */ - _(mulhsu, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const uint64_t b = rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHU: Multiply High Unsigned Unsigned */ - _(mulhu, { - rv->X[ir->rd] = - ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; - }) - - /* DIV: Divide Signed */ - _(div, { - const int32_t dividend = (int32_t) rv->X[ir->rs1]; - const int32_t divisor = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? rv->X[ir->rs1] /* overflow */ - : (unsigned int) (dividend / divisor); - }) - - /* DIVU: Divide Unsigned */ - _(divu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; - }) - - /* REM: Remainder Signed */ - _(rem, { - const int32_t dividend = rv->X[ir->rs1]; - const int32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? 0 /* overflow */ - : (dividend % divisor); - }) - - /* REMU: Remainder Unsigned */ - _(remu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; - }) -#endif /* RV32_HAS(EXT_M) */ - - /* RV32A Standard Extension - * At present, AMO is not implemented atomically because the emulated - * RISC-V core just runs on single thread, and no out-of-order execution - * happens. In addition, rl/aq are not handled. 
+#if RV32_HAS(EXT_A) /* RV32A Standard Extension */ +/* At present, AMO is not implemented atomically because the emulated RISC-V + * core just runs on a single thread, and no out-of-order execution happens. + * In addition, rl/aq are not handled. + */ + +/* LR.W: Load Reserved */ +RVOP(lrw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); + /* skip registration of the 'reservation set' + * FIXME: unimplemented */ -#if RV32_HAS(EXT_A) - /* LR.W: Load Reserved */ - _(lrw, { - /* skip registration of the 'reservation set' - * FIXME: uimplemented - */ - rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); - }) +}) - /* SC.W: Store Conditional */ - _(scw, { - /* assume the 'reservation set' is valid - * FIXME: unimplemented - */ - rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); - rv->X[ir->rd] = 0; - }) - - /* AMOSWAP.W: Atomic Swap */ - _(amoswapw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); - }) - - /* AMOADD.W: Atomic ADD */ - _(amoaddw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOXOR.W: Atomix XOR */ - _(amoxorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOAND.W: Atomic AND */ - _(amoandw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOOR.W: Atomic OR */ - _(amoorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMIN.W: Atomic MIN */ - _(amominw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a <
b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAX.W: Atomic MAX */ - _(amomaxw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a > b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMINU.W */ - _(amominuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a < b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAXU.W */ - _(amomaxuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a > b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) +/* SC.W: Store Conditional */ +RVOP(scw, { + /* assume the 'reservation set' is valid + * FIXME: unimplemented + */ + rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); + rv->X[ir->rd] = 0; +}) + +/* AMOSWAP.W: Atomic Swap */ +RVOP(amoswapw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); +}) + +/* AMOADD.W: Atomic ADD */ +RVOP(amoaddw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOXOR.W: Atomic XOR */ +RVOP(amoxorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOAND.W: Atomic AND */ +RVOP(amoandw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOOR.W: Atomic OR */ +RVOP(amoorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMIN.W: Atomic MIN */ +RVOP(amominw, { +
rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAX.W: Atomic MAX */ +RVOP(amomaxw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a > b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMINU.W */ +RVOP(amominuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAXU.W */ +RVOP(amomaxuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a > b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) #endif /* RV32_HAS(EXT_A) */ - /* RV32F Standard Extension */ -#if RV32_HAS(EXT_F) - /* FLW */ - _(flw, { - /* copy into the float register */ - const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); - memcpy(rv->F + ir->rd, &data, 4); - }) - - /* FSW */ - _(fsw, { - /* copy from float registers */ - uint32_t data; - memcpy(&data, (const void *) (rv->F + ir->rs2), 4); - rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); - }) - - /* FMADD.S */ - _(fmadds, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3];) - - /* FMSUB.S */ - _(fmsubs, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3];) - - /* FNMSUB.S */ - _(fnmsubs, - rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]);) - - /* FNMADD.S */ - _(fnmadds, - rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3];) - - /* FADD.S */ - _(fadds, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || - isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { - /* raise invalid operation */ - rv->F_int[ir->rd] = RV_NAN; - /* F_int is the 
integer shortcut of F */ +#if RV32_HAS(EXT_F) /* RV32F Standard Extension */ +/* FLW */ +RVOP(flw, { + /* copy into the float register */ + const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); + memcpy(rv->F + ir->rd, &data, 4); +}) + +/* FSW */ +RVOP(fsw, { + /* copy from float registers */ + uint32_t data; + memcpy(&data, (const void *) (rv->F + ir->rs2), 4); + rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); +}) + +/* FMADD.S */ +RVOP(fmadds, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3]; }) + +/* FMSUB.S */ +RVOP(fmsubs, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3]; }) + +/* FNMSUB.S */ +RVOP(fnmsubs, + { rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]); }) + +/* FNMADD.S */ +RVOP(fnmadds, + { rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3]; }) + +/* FADD.S */ +RVOP(fadds, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || + isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { + /* raise invalid operation */ + rv->F_int[ir->rd] = RV_NAN; /* F_int is the integer shortcut of F */ + rv->csr_fcsr |= FFLAG_INVALID_OP; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; + } + if (isinff(rv->F[ir->rd])) { + rv->csr_fcsr |= FFLAG_OVERFLOW; + rv->csr_fcsr |= FFLAG_INEXACT; + } +}) + +/* FSUB.S */ +RVOP(fsubs, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { + rv->F_int[ir->rd] = RV_NAN; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + } +}) + +/* FMUL.S */ +RVOP(fmuls, { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2]; }) + +/* FDIV.S */ +RVOP(fdivs, { rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2]; }) + +/* FSQRT.S */ +RVOP(fsqrts, { rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]); }) + +/* FSGNJ.S */ +RVOP(fsgnjs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJN.S */ 
+RVOP(fsgnjns, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJX.S */ +RVOP(fsgnjxs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = f1 ^ (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FMIN.S + * In IEEE754-201x, fmin(x, y) return + * - min(x,y) if both numbers are not NaN + * - if one is NaN and another is a number, return the number + * - if both are NaN, return NaN + * When input is signaling NaN, raise invalid operation + */ +RVOP(fmins, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; - } - if (isinff(rv->F[ir->rd])) { - rv->csr_fcsr |= FFLAG_OVERFLOW; - rv->csr_fcsr |= FFLAG_INEXACT; - } - }) - - /* FSUB.S */ - _(fsubs, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { rv->F_int[ir->rd] = RV_NAN; + } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 
rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) - - /* FMUL.S */ - _(fmuls, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2];) - - /* FDIV.S */ - _(fdivs, rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2];) - - /* FSQRT.S */ - _(fsqrts, rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]);) - - /* FSGNJ.S */ - _(fsgnjs, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJN.S */ - _(fsgnjns, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJX.S */ - _(fsgnjxs, { - uint32_t f1; - uint32_t f2; - uint32_t res; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - res = f1 ^ (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FMIN.S */ - _(fmins, { - /* In IEEE754-201x, fmin(x, y) return - * - min(x,y) if both numbers are not NaN - * - if one is NaN and another is a number, return the number - * - if both are NaN, return NaN - * When input is signaling NaN, raise invalid operation - */ - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } +}) + +/* FMAX.S */ +RVOP(fmaxs, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) + rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - 
uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) - ? rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F_int[ir->rd] = RV_NAN; } - }) - - /* FMAX.S */ - _(fmaxs, { - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; } else { - uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) - ? rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) ? rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) + } +}) - /* FCVT.W.S */ - _(fcvtws, rv->X[ir->rd] = (int32_t) rv->F[ir->rs1];) +/* FCVT.W.S */ +RVOP(fcvtws, { rv->X[ir->rd] = (int32_t) rv->F[ir->rs1]; }) - /* FCVT.WU.S */ - _(fcvtwus, rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1];) +/* FCVT.WU.S */ +RVOP(fcvtwus, { rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1]; }) - /* FMV.X.W */ - _(fmvxw, memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4);) +/* FMV.X.W */ +RVOP(fmvxw, { memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4); }) - /* FEQ.S performs a quiet comparison: it only sets the invalid - * operation exception flag if either input is a signaling NaN. - */ - _(feqs, rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 
1 : 0;) +/* FEQ.S performs a quiet comparison: it only sets the invalid + * operation exception flag if either input is a signaling NaN. + */ +RVOP(feqs, { rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 1 : 0; }) - /* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers - * to as signaling comparisons: that is, they set the invalid - * operation exception flag if either input is NaN. - */ - _(flts, rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0;) +/* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers + * to as signaling comparisons: that is, they set the invalid + * operation exception flag if either input is NaN. + */ +RVOP(flts, { rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0; }) - /* FLE.S */ - _(fles, rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 1 : 0;) +RVOP(fles, { rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 1 : 0; }) - /* FCLASS.S */ - _(fclasss, { - uint32_t bits; - memcpy(&bits, rv->F + ir->rs1, 4); - rv->X[ir->rd] = calc_fclass(bits); - }) +/* FCLASS.S */ +RVOP(fclasss, { + uint32_t bits; + memcpy(&bits, rv->F + ir->rs1, 4); + rv->X[ir->rd] = calc_fclass(bits); +}) - /* FCVT.S.W */ - _(fcvtsw, rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1];) +/* FCVT.S.W */ +RVOP(fcvtsw, { rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1]; }) - /* FCVT.S.WU */ - _(fcvtswu, rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1];) +/* FCVT.S.WU */ +RVOP(fcvtswu, { rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1]; }) - /* FMV.W.X */ - _(fmvwx, memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4);) -#endif /* RV32_HAS(EXT_F) */ +/* FMV.W.X */ +RVOP(fmvwx, { memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4); }) +#endif - /* RV32C Standard Extension */ -#if RV32_HAS(EXT_C) - /* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended - * non-zero immediate, scaledby 4, to the stack pointer, x2, and - * writes the result to rd'. 
This instruction is used to generate - * pointers to stack-allocated variables, and expands to addi rd', - * x2, nzuimm[9:2]. - */ - _(caddi4spn, rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm;) +#if RV32_HAS(EXT_C) /* RV32C Standard Extension */ +/* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended + * non-zero immediate, scaled by 4, to the stack pointer, x2, and writes + * the result to rd'. This instruction is used to generate pointers to + * stack-allocated variables, and expands to addi rd', x2, nzuimm[9:2]. + */ +RVOP(caddi4spn, { rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm; }) - /* C.LW loads a 32-bit value from memory into register rd'. It - * computes an ffective address by adding the zero-extended offset, - * scaled by 4, to the base address in register rs1'. It expands to - * # lw rd', offset[6:2](rs1'). - */ - _(clw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) +/* C.LW loads a 32-bit value from memory into register rd'. It computes + * an effective address by adding the zero-extended offset, scaled by 4, + * to the base address in register rs1'. It expands to lw rd', + * offset[6:2](rs1'). + */ +RVOP(clw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) - /* C.SW stores a 32-bit value in register rs2' to memory. It computes - * an effective address by adding the zero-extended offset, scaled by - * 4, to the base address in register rs1'.
- * It expands to sw rs2', offset[6:2](rs1') - */ - _(csw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) - - /* C.NOP */ - _(cnop, /* nothing */) - - /* C.ADDI adds the non-zero sign-extended 6-bit immediate to the - * value in register rd then writes the result to rd. C.ADDI expands - * into addi rd, rd, nzimm[5:0]. C.ADDI is only valid when rd̸=x0. - * The code point with both rd=x0 and nzimm=0 encodes the C.NOP - * instruction; the remaining code points with either rd=x0 or - * nzimm=0 encode HINTs. - */ - _(caddi, rv->X[ir->rd] += (int16_t) ir->imm;) - - /* C.JAL */ - _(cjal, { - rv->X[1] = rv->PC + ir->insn_len; - rv->PC += ir->imm; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.SW stores a 32-bit value in register rs2' to memory. It computes an + * effective address by adding the zero-extended offset, scaled by 4, to + * the base address in register rs1'. + * It expands to sw rs2', offset[6:2](rs1') + */ +RVOP(csw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) - /* C.LI loads the sign-extended 6-bit immediate, imm, into - * register rd. - * C.LI expands into addi rd, x0, imm[5:0]. - * C.LI is only valid when rd=x0; the code points with rd=x0 encode - * HINTs. - */ - _(cli, rv->X[ir->rd] = ir->imm;) +/* C.NOP */ +RVOP(cnop, {/* no operation */}) - /* C.ADDI16SP is used to adjust the stack pointer in procedure - * prologues and epilogues. - * It expands into addi x2, x2, nzimm[9:4]. 
- * C.ADDI16SP is only valid when nzimm̸=0; the code point with - * nzimm=0 is reserved. - */ - _(caddi16sp, rv->X[ir->rd] += ir->imm;) - - /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of - * the destination register, clears the bottom 12 bits, and - * sign-extends bit 17 into all higher bits of the destination. - * C.LUI expands into lui rd, nzimm[17:12]. - * C.LUI is only valid when rd̸={x0, x2}, and when the immediate is - * not equal to zero. - */ - _(clui, rv->X[ir->rd] = ir->imm;) +/* C.ADDI adds the non-zero sign-extended 6-bit immediate to the value + * in register rd then writes the result to rd. C.ADDI expands into addi + * rd, rd, nzimm[5:0]. C.ADDI is only valid when rd̸=x0. The code point + * with both rd=x0 and nzimm=0 encodes the C.NOP instruction; the + * remaining code points with either rd=x0 or nzimm=0 encode HINTs. + */ +RVOP(caddi, { rv->X[ir->rd] += (int16_t) ir->imm; }) - /* C.SRLI is a CB-format instruction that performs a logical right - * shift of the value in register rd' then writes the result to rd'. - * The shift amount is encoded in the shamt field. C.SRLI expands - * into srli rd', rd', shamt[5:0]. - */ - _(csrli, rv->X[ir->rs1] >>= ir->shamt;) +/* C.JAL */ +RVOP(cjal, { + rv->X[1] = rv->PC + ir->insn_len; + rv->PC += ir->imm; + if (unlikely(rv->PC & 0x1)) { + rv->compressed = true; + rv_except_insn_misaligned(rv, rv->PC); + return false; + } +}) - /* C.SRAI is defined analogously to C.SRLI, but instead performs an - * arithmetic right shift. - * C.SRAI expands to srai rd', rd', shamt[5:0]. - */ - _(csrai, { - const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; - rv->X[ir->rs1] >>= ir->shamt; - for (unsigned int i = 0; i < ir->shamt; ++i) - rv->X[ir->rs1] |= mask >> i; - }) - - /* C.ANDI is a CB-format instruction that computes the bitwise AND of - * the value in register rd' and the sign-extended 6-bit immediate, - * then writes the result to rd'. - * C.ANDI expands to andi rd', rd', imm[5:0]. 
- */ - _(candi, rv->X[ir->rs1] &= ir->imm;) +/* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. + * C.LI expands into addi rd, x0, imm[5:0]. + * C.LI is only valid when rd≠x0; the code points with rd=x0 encode + * HINTs. + */ +RVOP(cli, { rv->X[ir->rd] = ir->imm; }) - /* C.SUB */ - _(csub, rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2];) +/* C.ADDI16SP is used to adjust the stack pointer in procedure + * prologues and epilogues. + * It expands into addi x2, x2, nzimm[9:4]. + * C.ADDI16SP is only valid when nzimm≠0; the code point with nzimm=0 + * is reserved. + */ +RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }) + +/* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the + * destination register, clears the bottom 12 bits, and sign-extends bit + * 17 into all higher bits of the destination. + * C.LUI expands into lui rd, nzimm[17:12]. + * C.LUI is only valid when rd≠{x0, x2}, and when the immediate is not + * equal to zero. + */ +RVOP(clui, { rv->X[ir->rd] = ir->imm; }) - /* C.XOR */ - _(cxor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];) +/* C.SRLI is a CB-format instruction that performs a logical right shift + * of the value in register rd' then writes the result to rd'. The shift + * amount is encoded in the shamt field. C.SRLI expands into srli rd', + * rd', shamt[5:0]. + */ +RVOP(csrli, { rv->X[ir->rs1] >>= ir->shamt; }) - /* C.OR */ - _(cor, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];) +/* C.SRAI is defined analogously to C.SRLI, but instead performs an + * arithmetic right shift. C.SRAI expands to srai rd', rd', shamt[5:0]. + */ +RVOP(csrai, { + const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; + rv->X[ir->rs1] >>= ir->shamt; + for (unsigned int i = 0; i < ir->shamt; ++i) + rv->X[ir->rs1] |= mask >> i; +}) + +/* C.ANDI is a CB-format instruction that computes the bitwise AND of + * the value in register rd' and the sign-extended 6-bit immediate, then + * writes the result to rd'.
C.ANDI expands to andi rd', rd', imm[5:0]. + */ +RVOP(candi, { rv->X[ir->rs1] &= ir->imm; }) - /* C.AND */ - _(cand, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];) +/* C.SUB */ +RVOP(csub, { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }) - /* C.J performs an unconditional control transfer. The offset is - * sign-extended and added to the pc to form the jump target address. - * C.J can therefore target a ±2 KiB range. - * C.J expands to jal x0, offset[11:1]. - */ - _(cj, { - rv->PC += ir->imm; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.XOR */ +RVOP(cxor, { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }) - /* C.BEQZ performs conditional control transfers. The offset is - * sign-extended and added to the pc to form the branch target - * address. It can therefore target a ±256 B range. C.BEQZ takes the - * branch if the value in register rs1' is zero. - * It expands to beq rs1', x0, offset[8:1]. - */ - _(cbeqz, { - rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +RVOP(cor, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) - _(cbnez, { - rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +RVOP(cand, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) - /* C.SLLI is a CI-format instruction that performs a logical left - * shift of the value in register rd then writes the result to rd. - * The shift amount is encoded in the shamt field. - * C.SLLI expands into slli rd, rd, shamt[5:0]. 
- */ - _(cslli, rv->X[ir->rd] <<= (uint8_t) ir->imm;) - - /* C.LWSP */ - _(clwsp, { - const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) - - /* C.JR */ - _(cjr, { - rv->PC = rv->X[ir->rs1]; - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.J performs an unconditional control transfer. The offset is + * sign-extended and added to the pc to form the jump target address. + * C.J can therefore target a ±2 KiB range. + * C.J expands to jal x0, offset[11:1]. + */ +RVOP(cj, { + rv->PC += ir->imm; + if (unlikely(rv->PC & 0x1)) { + rv->compressed = true; + rv_except_insn_misaligned(rv, rv->PC); + return false; + } +}) + +/* C.BEQZ performs conditional control transfers. The offset is + * sign-extended and added to the pc to form the branch target address. + * It can therefore target a ±256 B range. C.BEQZ takes the branch if + * the value in register rs1' is zero. It expands to beq rs1', x0, + * offset[8:1]. + */ +RVOP(cbeqz, + { rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; }) - /* C.MV */ - _(cmv, rv->X[ir->rd] = rv->X[ir->rs2];) +/* C.BNEZ: branch if the value in register rs1' is nonzero */ +RVOP(cbnez, { rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; }) + +/* C.SLLI is a CI-format instruction that performs a logical left shift + * of the value in register rd then writes the result to rd. The shift + * amount is encoded in the shamt field. C.SLLI expands into slli rd, + * rd, shamt[5:0].
+ */ +RVOP(cslli, { rv->X[ir->rd] <<= (uint8_t) ir->imm; }) - /* C.EBREAK */ - _(cebreak, { +/* C.LWSP */ +RVOP(clwsp, { + const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; + if (unlikely(addr & 3)) { rv->compressed = true; - rv->io.on_ebreak(rv); - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* C.JALR */ - _(cjalr, { - /* Unconditional jump and store PC+2 to ra */ - const int32_t jump_to = rv->X[ir->rs1]; - rv->X[rv_reg_ra] = rv->PC + ir->insn_len; - rv->PC = jump_to; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* C.ADD adds the values in registers rd and rs2 and writes the - * result to register rd. - * C.ADD expands into add rd, rd, rs2. - * C.ADD is only valid when rs2=x0; the code points with rs2=x0 - * correspond to the C.JALR and C.EBREAK instructions. The code - * points with rs2=x0 and rd=x0 are HINTs. 
- */ - _(cadd, rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2];) - - /* C.SWSP */ - _(cswsp, { - const uint32_t addr = rv->X[2] + ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) -#endif /* RV32_HAS(EXT_C) */ + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) + +/* C.JR */ +RVOP(cjr, { rv->PC = rv->X[ir->rs1]; }) + +/* C.MV */ +RVOP(cmv, { rv->X[ir->rd] = rv->X[ir->rs2]; }) + +/* C.EBREAK */ +RVOP(cebreak, { + rv->compressed = true; + rv->io.on_ebreak(rv); +}) + +/* C.JALR */ +RVOP(cjalr, { + /* Unconditional jump and store PC+2 to ra */ + const int32_t jump_to = rv->X[ir->rs1]; + rv->X[rv_reg_ra] = rv->PC + ir->insn_len; + rv->PC = jump_to; + if (unlikely(rv->PC & 0x1)) { + rv->compressed = true; + rv_except_insn_misaligned(rv, rv->PC); + return false; + } +}) + +/* C.ADD adds the values in registers rd and rs2 and writes the + * result to register rd. + * C.ADD expands into add rd, rd, rs2. + * C.ADD is only valid when rs2≠x0; the code points with rs2=x0 + * correspond to the C.JALR and C.EBREAK instructions. The code + * points with rs2≠x0 and rd=x0 are HINTs.
+ */ +RVOP(cadd, { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }) -#undef _ +/* C.SWSP */ +RVOP(cswsp, { + const uint32_t addr = rv->X[2] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) +#endif - EPILOGUE() -} +static const void *dispatch_table[] = { +#define _(inst, can_branch) [rv_insn_##inst] = do_##inst, + RISCV_INSN_LIST +#undef _ +}; static bool insn_is_branch(uint8_t opcode) { switch (opcode) { - case rv_insn_jal: - case rv_insn_jalr: - case rv_insn_beq: - case rv_insn_bne: - case rv_insn_blt: - case rv_insn_bge: - case rv_insn_bltu: - case rv_insn_bgeu: - case rv_insn_ecall: - case rv_insn_ebreak: - case rv_insn_mret: -#if RV32_HAS(EXT_C) - case rv_insn_cj: - case rv_insn_cjr: - case rv_insn_cjal: - case rv_insn_cjalr: - case rv_insn_cbeqz: - case rv_insn_cbnez: - case rv_insn_cebreak: -#endif -#if RV32_HAS(Zifencei) - case rv_insn_fencei: -#endif +#define _(inst, can_branch) IIF(can_branch)(case rv_insn_##inst:, ) + RISCV_INSN_LIST +#undef _ return true; } return false; @@ -1449,7 +1321,7 @@ static void block_translate(riscv_t *rv, block_t *block) rv_except_illegal_insn(rv, insn); break; } - + ir->impl = dispatch_table[ir->opcode]; /* compute the end of pc */ block->pc_end += ir->insn_len; block->n_insn++; @@ -1458,6 +1330,7 @@ static void block_translate(riscv_t *rv, block_t *block) if (insn_is_branch(ir->opcode)) break; } + block->ir[block->n_insn - 1].tailcall = true; } static block_t *block_find_or_translate(riscv_t *rv, block_t *prev) @@ -1520,7 +1393,8 @@ void rv_step(riscv_t *rv, int32_t cycles) assert(block); /* execute the block */ - if (!emulate(rv, block)) + const rv_insn_t *ir = block->ir; + if (unlikely(!ir->impl(rv, ir))) break; prev = block;