From fba7a7dda416cfa21da1b8bdf3d32f97a9a1cd80 Mon Sep 17 00:00:00 2001 From: YenFuChen Date: Mon, 12 Dec 2022 23:16:37 +0800 Subject: [PATCH] Use TCO of C compiler to speed up emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to rewrite the function emulate into a recursive form to meet the requirements of tail-call optimization (TCO). To achieve this, I add a field is_tail to the struct rv_insn_t that marks whether an instruction is the last one in its basic block. With this field, emulate can be rewritten as a self-recursive function that tail-calls itself on the next instruction until the end of the block is reached. Running the CoreMark and Dhrystone benchmarks now produces faster results than before; the test results are shown below. Test environment: 2020 M1 MacBook Pro 13-inch Coremark test result: Previous: 655.631123 Iterations/Sec Now: 791.928093 Iterations/Sec Dhrystone test result: Previous: 712 DMIPS Now: 851 DMIPS Previously, when the function emulate finished, it returned to the function block_emulate, because the call path was rv_step -> block_emulate -> emulate. So, each time emulate was called, a new stack frame was created. With TCO, emulate can now reuse the same stack frame. That is, every instruction in a basic block is executed by emulate within a single stack frame, and we save the overhead of creating a stack frame for each instruction. 
--- Makefile | 1 + src/decode.h | 1 + src/emulate.c | 80 +++++++++++++++++++++++++++------------------------ 3 files changed, 44 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 4d87da712..87c59e250 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ OUT ?= build BIN := $(OUT)/rv32emu CFLAGS = -std=gnu99 -O2 -Wall -Wextra +CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector CFLAGS += -include src/common.h # Set the default stack pointer diff --git a/src/decode.h b/src/decode.h index 63f134944..6ca17a69d 100644 --- a/src/decode.h +++ b/src/decode.h @@ -240,6 +240,7 @@ typedef struct { /* instruction length */ uint8_t insn_len; + bool is_tail; } rv_insn_t; /* translated basic block */ diff --git a/src/emulate.c b/src/emulate.c index 995e3b6f1..ef3023e34 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -256,6 +256,7 @@ static bool insn_is_misaligned(uint32_t pc) static bool emulate(riscv_t *rv, const rv_insn_t *ir) { + rv->X[rv_reg_zero] = 0; switch (ir->opcode) { /* RV32I Base Instruction Set */ case rv_insn_lui: /* LUI: Load Upper Immediate */ @@ -292,6 +293,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } case rv_insn_jalr: { /* JALR: Jump and Link Register */ @@ -315,6 +317,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } case rv_insn_beq: { /* BEQ: Branch if Equal */ @@ -328,6 +331,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -343,6 +347,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -358,6 +363,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -373,6 +379,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t 
*ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -388,6 +395,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -403,6 +411,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } break; @@ -473,14 +482,14 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) * result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler * pseudo-instruction. */ - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; + rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) + ir->imm; break; case rv_insn_slti: /* SLTI: Set on Less Than Immediate */ /* Place the value 1 in register rd if register rs1 is less than the * signextended immediate when both are treated as signed numbers, else * 0 is written to rd. */ - rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; + rv->X[ir->rd] = ((int32_t)(rv->X[ir->rs1]) < ir->imm) ? 1 : 0; break; case rv_insn_sltiu: /* SLTIU: Set on Less Than Immediate Unsigned */ /* Place the value 1 in register rd if register rs1 is less than the @@ -520,17 +529,17 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f); break; case rv_insn_add: /* ADD */ - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); + rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) + (int32_t)(rv->X[ir->rs2]); break; case rv_insn_sub: /* SUB: Substract */ - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); + rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) - (int32_t)(rv->X[ir->rs2]); break; case rv_insn_sll: /* SLL: Shift Left Logical */ rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); break; case rv_insn_slt: /* SLT: Set on Less Than */ rv->X[ir->rd] = - ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; + ((int32_t)(rv->X[ir->rs1]) < (int32_t)(rv->X[ir->rs2])) ? 
1 : 0; break; case rv_insn_sltu: /* SLTU: Set on Less Than Unsigned */ rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0; @@ -553,10 +562,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) case rv_insn_ecall: /* ECALL: Environment Call */ rv->compressed = false; rv->io.on_ecall(rv); + rv->csr_cycle++; return true; case rv_insn_ebreak: /* EBREAK: Environment Break */ rv->compressed = false; rv->io.on_ebreak(rv); + rv->csr_cycle++; return true; case rv_insn_wfi: /* WFI: Wait for Interrupt */ case rv_insn_uret: /* URET: return from traps in U-mode */ @@ -567,6 +578,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) case rv_insn_mret: /* MRET: return from traps in U-mode */ rv->PC = rv->csr_mepc; /* this is a branch */ + rv->csr_cycle++; return true; #if RV32_HAS(Zifencei) @@ -620,13 +632,13 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) case rv_insn_mulh: { /* MULH: Multiply High Signed Signed */ const int64_t a = (int32_t) rv->X[ir->rs1], b = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; + rv->X[ir->rd] = ((uint64_t)(a * b)) >> 32; break; } case rv_insn_mulhsu: { /* MULHSU: Multiply High Signed Unsigned */ const int64_t a = (int32_t) rv->X[ir->rs1]; const uint64_t b = rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; + rv->X[ir->rd] = ((uint64_t)(a * b)) >> 32; break; } case rv_insn_mulhu: /* MULHU: Multiply High Unsigned Unsigned */ @@ -636,10 +648,11 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) case rv_insn_div: { /* DIV: Divide Signed */ const int32_t dividend = (int32_t) rv->X[ir->rs1]; const int32_t divisor = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? rv->X[ir->rs1] /* overflow */ - : (unsigned int) (dividend / divisor); + rv->X[ir->rd] = !divisor + ? ~0U + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? 
rv->X[ir->rs1] /* overflow */ + : (unsigned int) (dividend / divisor); break; } case rv_insn_divu: { /* DIVU: Divide Unsigned */ @@ -649,10 +662,11 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) } case rv_insn_rem: { /* REM: Remainder Signed */ const int32_t dividend = rv->X[ir->rs1], divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? 0 /* overflow */ - : (dividend % divisor); + rv->X[ir->rd] = !divisor + ? dividend + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? 0 /* overflow */ + : (dividend % divisor); break; } case rv_insn_remu: { /* REMU: Remainder Unsigned */ @@ -986,6 +1000,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cli: /* C.LI */ /* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. @@ -1064,6 +1079,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cbeqz: /* C.BEQZ */ /* BEQZ performs conditional control transfers. The offset is @@ -1074,10 +1090,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) */ rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cbnez: /* C.BEQZ */ rv->PC += (rv->X[ir->rs1]) ? 
(uint32_t) ir->imm : ir->insn_len; /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cslli: /* C.SLLI */ /* C.SLLI is a CI-format instruction that performs a logical left shift @@ -1100,6 +1118,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) case rv_insn_cjr: /* C.JR */ rv->PC = rv->X[ir->rs1]; /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cmv: /* C.MV */ rv->X[ir->rd] = rv->X[ir->rs2]; @@ -1108,6 +1127,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) rv->compressed = true; rv->io.on_ebreak(rv); /* can branch */ + rv->csr_cycle++; return true; case rv_insn_cjalr: { /* C.JALR */ /* Unconditional jump and store PC+2 to ra */ @@ -1120,6 +1140,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) return false; } /* can branch */ + rv->csr_cycle++; return true; } case rv_insn_cadd: /* C.ADD */ @@ -1147,7 +1168,10 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir) /* step over instruction */ rv->PC += ir->insn_len; - return true; + rv->csr_cycle++; + if (ir->is_tail) + return true; + return emulate(rv, ir + 1); } static bool insn_is_branch(uint8_t opcode) @@ -1240,27 +1264,6 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr) return NULL; } -/* execute a basic block */ -static bool block_emulate(riscv_t *rv, const block_t *block) -{ - const uint32_t n_insn = block->n_insn; - const rv_insn_t *ir = block->ir; - - /* execute the block */ - for (uint32_t i = 0; i < n_insn; i++) { - /* enforce zero register */ - rv->X[rv_reg_zero] = 0; - - /* execute the instruction */ - if (!emulate(rv, ir + i)) - return false; - - /* increment the cycles csr */ - rv->csr_cycle++; - } - return true; -} - static void block_translate(riscv_t *rv, block_t *block) { block->pc_start = block->pc_end = rv->PC; @@ -1288,6 +1291,7 @@ static void block_translate(riscv_t *rv, block_t *block) if (insn_is_branch(ir->opcode)) break; } + (block->ir + block->n_insn - 1)->is_tail = true; } static block_t 
*block_find_or_translate(riscv_t *rv, block_t *prev) @@ -1350,7 +1354,7 @@ void rv_step(riscv_t *rv, int32_t cycles) assert(block); /* execute the block */ - if (!block_emulate(rv, block)) + if (!emulate(rv, block->ir)) break; prev = block;