Skip to content

Commit

Permalink
Use TCO of C compiler to speed up emulation
Browse files Browse the repository at this point in the history
We need to turn the function emulate into a recursive version in order
to meet the requirements of tail-call optimization (TCO). To achieve
this, I add a variable is_tail to the struct rv_insn_t to help us
determine whether an instruction is the last one of its basic block.
As a result, we can use this variable to rewrite the function emulate
as a self-recursive function.

Running the CoreMark and Dhrystone benchmarks now produces faster
results than it did previously; the test results are shown below.

Test environment: 2020 M1 MacBook Pro 13-inch
Coremark test result:
Previous: 655.631123 Iterations/Sec
Now: 791.928093 Iterations/Sec
Dhrystone test result:
Previous: 712 DMIPS
Now: 851 DMIPS

Previously, the calling route was rv_step -> block_emulate -> emulate,
so whenever the function emulate finished, it returned to the function
block_emulate. As a consequence, a new stack frame was created each
time emulate was called. With TCO, however, emulate can now reuse the
same stack frame. That is, every instruction in a basic block can be
executed by emulate within the same stack frame, and we save the
overhead of creating a stack frame for each call.
  • Loading branch information
qwe661234 committed Dec 12, 2022
1 parent 285a988 commit fba7a7d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 38 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ OUT ?= build
BIN := $(OUT)/rv32emu

CFLAGS = -std=gnu99 -O2 -Wall -Wextra
CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector
CFLAGS += -include src/common.h

# Set the default stack pointer
Expand Down
1 change: 1 addition & 0 deletions src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ typedef struct {

/* instruction length */
uint8_t insn_len;
bool is_tail;
} rv_insn_t;

/* translated basic block */
Expand Down
80 changes: 42 additions & 38 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ static bool insn_is_misaligned(uint32_t pc)

static bool emulate(riscv_t *rv, const rv_insn_t *ir)
{
rv->X[rv_reg_zero] = 0;
switch (ir->opcode) {
/* RV32I Base Instruction Set */
case rv_insn_lui: /* LUI: Load Upper Immediate */
Expand Down Expand Up @@ -292,6 +293,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_jalr: { /* JALR: Jump and Link Register */
Expand All @@ -315,6 +317,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_beq: { /* BEQ: Branch if Equal */
Expand All @@ -328,6 +331,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -343,6 +347,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -358,6 +363,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -373,6 +379,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -388,6 +395,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -403,6 +411,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand Down Expand Up @@ -473,14 +482,14 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
* result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler
* pseudo-instruction.
*/
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm;
rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) + ir->imm;
break;
case rv_insn_slti: /* SLTI: Set on Less Than Immediate */
/* Place the value 1 in register rd if register rs1 is less than the
* signextended immediate when both are treated as signed numbers, else
* 0 is written to rd.
*/
rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0;
rv->X[ir->rd] = ((int32_t)(rv->X[ir->rs1]) < ir->imm) ? 1 : 0;
break;
case rv_insn_sltiu: /* SLTIU: Set on Less Than Immediate Unsigned */
/* Place the value 1 in register rd if register rs1 is less than the
Expand Down Expand Up @@ -520,17 +529,17 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f);
break;
case rv_insn_add: /* ADD */
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]);
rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) + (int32_t)(rv->X[ir->rs2]);
break;
case rv_insn_sub: /* SUB: Substract */
rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]);
rv->X[ir->rd] = (int32_t)(rv->X[ir->rs1]) - (int32_t)(rv->X[ir->rs2]);
break;
case rv_insn_sll: /* SLL: Shift Left Logical */
rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f);
break;
case rv_insn_slt: /* SLT: Set on Less Than */
rv->X[ir->rd] =
((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0;
((int32_t)(rv->X[ir->rs1]) < (int32_t)(rv->X[ir->rs2])) ? 1 : 0;
break;
case rv_insn_sltu: /* SLTU: Set on Less Than Unsigned */
rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0;
Expand All @@ -553,10 +562,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_ecall: /* ECALL: Environment Call */
rv->compressed = false;
rv->io.on_ecall(rv);
rv->csr_cycle++;
return true;
case rv_insn_ebreak: /* EBREAK: Environment Break */
rv->compressed = false;
rv->io.on_ebreak(rv);
rv->csr_cycle++;
return true;
case rv_insn_wfi: /* WFI: Wait for Interrupt */
case rv_insn_uret: /* URET: return from traps in U-mode */
Expand All @@ -567,6 +578,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_mret: /* MRET: return from traps in U-mode */
rv->PC = rv->csr_mepc;
/* this is a branch */
rv->csr_cycle++;
return true;

#if RV32_HAS(Zifencei)
Expand Down Expand Up @@ -620,13 +632,13 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_mulh: { /* MULH: Multiply High Signed Signed */
const int64_t a = (int32_t) rv->X[ir->rs1],
b = (int32_t) rv->X[ir->rs2];
rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32;
rv->X[ir->rd] = ((uint64_t)(a * b)) >> 32;
break;
}
case rv_insn_mulhsu: { /* MULHSU: Multiply High Signed Unsigned */
const int64_t a = (int32_t) rv->X[ir->rs1];
const uint64_t b = rv->X[ir->rs2];
rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32;
rv->X[ir->rd] = ((uint64_t)(a * b)) >> 32;
break;
}
case rv_insn_mulhu: /* MULHU: Multiply High Unsigned Unsigned */
Expand All @@ -636,10 +648,11 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_div: { /* DIV: Divide Signed */
const int32_t dividend = (int32_t) rv->X[ir->rs1];
const int32_t divisor = (int32_t) rv->X[ir->rs2];
rv->X[ir->rd] = !divisor ? ~0U
: (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
? rv->X[ir->rs1] /* overflow */
: (unsigned int) (dividend / divisor);
rv->X[ir->rd] = !divisor
? ~0U
: (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
? rv->X[ir->rs1] /* overflow */
: (unsigned int) (dividend / divisor);
break;
}
case rv_insn_divu: { /* DIVU: Divide Unsigned */
Expand All @@ -649,10 +662,11 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
}
case rv_insn_rem: { /* REM: Remainder Signed */
const int32_t dividend = rv->X[ir->rs1], divisor = rv->X[ir->rs2];
rv->X[ir->rd] = !divisor ? dividend
: (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
? 0 /* overflow */
: (dividend % divisor);
rv->X[ir->rd] = !divisor
? dividend
: (divisor == -1 && rv->X[ir->rs1] == 0x80000000U)
? 0 /* overflow */
: (dividend % divisor);
break;
}
case rv_insn_remu: { /* REMU: Remainder Unsigned */
Expand Down Expand Up @@ -986,6 +1000,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cli: /* C.LI */
/* C.LI loads the sign-extended 6-bit immediate, imm, into register rd.
Expand Down Expand Up @@ -1064,6 +1079,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cbeqz: /* C.BEQZ */
/* BEQZ performs conditional control transfers. The offset is
Expand All @@ -1074,10 +1090,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
*/
rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cbnez: /* C.BEQZ */
rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cslli: /* C.SLLI */
/* C.SLLI is a CI-format instruction that performs a logical left shift
Expand All @@ -1100,6 +1118,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_cjr: /* C.JR */
rv->PC = rv->X[ir->rs1];
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cmv: /* C.MV */
rv->X[ir->rd] = rv->X[ir->rs2];
Expand All @@ -1108,6 +1127,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
rv->compressed = true;
rv->io.on_ebreak(rv);
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cjalr: { /* C.JALR */
/* Unconditional jump and store PC+2 to ra */
Expand All @@ -1120,6 +1140,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_cadd: /* C.ADD */
Expand Down Expand Up @@ -1147,7 +1168,10 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)

/* step over instruction */
rv->PC += ir->insn_len;
return true;
rv->csr_cycle++;
if (ir->is_tail)
return true;
return emulate(rv, ir + 1);
}

static bool insn_is_branch(uint8_t opcode)
Expand Down Expand Up @@ -1240,27 +1264,6 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr)
return NULL;
}

/* Execute the instructions of a translated basic block, one after another.
 *
 * @rv:    emulator state; its zero register and cycle CSR are updated here
 * @block: translated block providing the instruction array (ir) and the
 *         number of instructions it holds (n_insn)
 *
 * Returns true when emulate() returned true for every instruction in the
 * block, and false as soon as emulate() returns false for one of them. In
 * the latter case the remaining instructions are not executed and the cycle
 * CSR is not incremented for the failing instruction.
 */
static bool block_emulate(riscv_t *rv, const block_t *block)
{
    const rv_insn_t *cursor = block->ir;
    uint32_t remaining = block->n_insn;

    while (remaining > 0) {
        /* enforce the zero register before every instruction, discarding
         * whatever a previous instruction may have stored there
         */
        rv->X[rv_reg_zero] = 0;

        /* stop at the first instruction that emulate() rejects */
        if (!emulate(rv, cursor))
            return false;

        /* one more instruction executed: bump the cycles csr */
        rv->csr_cycle++;

        cursor++;
        remaining--;
    }

    return true;
}

static void block_translate(riscv_t *rv, block_t *block)
{
block->pc_start = block->pc_end = rv->PC;
Expand Down Expand Up @@ -1288,6 +1291,7 @@ static void block_translate(riscv_t *rv, block_t *block)
if (insn_is_branch(ir->opcode))
break;
}
(block->ir + block->n_insn - 1)->is_tail = true;
}

static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
Expand Down Expand Up @@ -1350,7 +1354,7 @@ void rv_step(riscv_t *rv, int32_t cycles)
assert(block);

/* execute the block */
if (!block_emulate(rv, block))
if (!emulate(rv, block->ir))
break;

prev = block;
Expand Down

0 comments on commit fba7a7d

Please sign in to comment.