Skip to content

Commit

Permalink
Use TCO of C compiler to speed up emulation
Browse files Browse the repository at this point in the history
We need to turn the function emulate into a self-recursive version to
meet the requirements of tail-call optimization (TCO). To achieve this,
I add a member tailcall to the struct rv_insn_t that tells us whether
an instruction is the last one in its basic block. With this member, we
can rewrite the function emulate as a self-recursive function.

Running the CoreMark benchmark is now faster than it was previously;
the test results are shown below.

Test environment1: Ubuntu Linux 20.04 on Intel(R) Core(TM) i7-8700 CPU
@ 3.20GHz
Compiler: gcc 9.4.0
Coremark test result:
Previous: 870.317612 Iterations/Sec
Now: 920.675364 Iterations/Sec
-----------------------------------------------------------------------
Test environment2: Ubuntu Linux 20.04 on Intel(R) Core(TM) i7-8700 CPU
@ 3.20GHz
Compiler: clang-16
Coremark test result:
Previous: 805.702322 Iterations/Sec
Now: 849.445119 Iterations/Sec
-----------------------------------------------------------------------
Test environment3: Ubuntu Linux on eMAG
Compiler: gcc 11.3.0
Coremark test result:
Previous: 311.436129 Iterations/Sec
Now: 313.900684 Iterations/Sec
-----------------------------------------------------------------------
Test environment4: Ubuntu Linux on eMAG
Compiler: clang-16
Coremark test result:
Previous: 273.265220 Iterations/Sec
Now: 297.255706 Iterations/Sec

Previously, the calling route was rv_step -> block_emulate -> emulate ->
block_emulate -> emulate -> ..., so whenever the function emulate
finished, it returned to the function block_emulate, and a new function
stack frame was created each time emulate was called. The current
calling route is rv_step -> emulate -> emulate -> ..., so, thanks to
TCO, the function emulate can keep using the same function stack frame.
That is, all instructions in a basic block are executed by emulate
within a single function stack frame, which saves the overhead of
creating a function stack frame for every instruction.
  • Loading branch information
qwe661234 committed Dec 13, 2022
1 parent 285a988 commit 044cd7b
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 24 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ endif
$(call set-feature, COMPUTED_GOTO)
ifeq ($(call has, COMPUTED_GOTO), 1)
ifeq ("$(CC_IS_GCC)", "1")
$(OUT)/emulate.o: CFLAGS += -fno-gcse -fno-crossjumping
# For tail-call elimination, we need a specific set of build flags applied.
$(OUT)/emulate.o: CFLAGS += -fno-gcse -fno-crossjumping -fomit-frame-pointer -fno-stack-check -fno-stack-protector
endif
endif

Expand Down
10 changes: 10 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,13 @@
#define IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define IIF_1(t, ...) t

/* There is no tail-call optimization (TCO) in non-optimized builds. To work
 * around this, we attempt to use a compiler attribute called musttail, which
 * forces the compiler to tail-call-optimize even when optimizations are off.
 * When the attribute is unavailable, MUST_TAIL expands to nothing and the
 * annotated return is an ordinary return statement.
 *
 * The __has_attribute probe is kept in its own nested #if on purpose: a
 * preprocessor that lacks the operator parses the whole #if expression even
 * when the defined() test is false, so writing
 * "defined(__has_attribute) && __has_attribute(musttail)" on a single line
 * is a syntax error there.
 */
#if defined(__has_attribute)
#if __has_attribute(musttail)
#define MUST_TAIL __attribute__((musttail))
#endif
#endif
#ifndef MUST_TAIL
#define MUST_TAIL
#endif
11 changes: 11 additions & 0 deletions src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,17 @@ typedef struct {

/* instruction length */
uint8_t insn_len;
/* According to tail-call optimization (TCO), if a C function ends with
* a call to another function or to itself and simply returns that
* call's result, the compiler can replace the 'call' and 'return'
* instructions with a simple jump to the callee. A self-recursive
* function can therefore reuse the same function stack frame.
*
* Member tailcall tells us whether this IR is the last IR in a basic
* block. We can use this member to rewrite the function emulate into a
* self-recursive version, allowing the compiler to do TCO.
*/
bool tailcall;
} rv_insn_t;

/* translated basic block */
Expand Down
48 changes: 25 additions & 23 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ static bool insn_is_misaligned(uint32_t pc)

static bool emulate(riscv_t *rv, const rv_insn_t *ir)
{
rv->X[rv_reg_zero] = 0;
switch (ir->opcode) {
/* RV32I Base Instruction Set */
case rv_insn_lui: /* LUI: Load Upper Immediate */
Expand Down Expand Up @@ -292,6 +293,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_jalr: { /* JALR: Jump and Link Register */
Expand All @@ -315,6 +317,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_beq: { /* BEQ: Branch if Equal */
Expand All @@ -328,6 +331,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -343,6 +347,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -358,6 +363,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -373,6 +379,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -388,6 +395,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand All @@ -403,6 +411,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
break;
Expand Down Expand Up @@ -553,10 +562,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_ecall: /* ECALL: Environment Call */
rv->compressed = false;
rv->io.on_ecall(rv);
rv->csr_cycle++;
return true;
case rv_insn_ebreak: /* EBREAK: Environment Break */
rv->compressed = false;
rv->io.on_ebreak(rv);
rv->csr_cycle++;
return true;
case rv_insn_wfi: /* WFI: Wait for Interrupt */
case rv_insn_uret: /* URET: return from traps in U-mode */
Expand All @@ -567,6 +578,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_mret: /* MRET: return from traps in M-mode */
rv->PC = rv->csr_mepc;
/* this is a branch */
rv->csr_cycle++;
return true;

#if RV32_HAS(Zifencei)
Expand Down Expand Up @@ -986,6 +998,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cli: /* C.LI */
/* C.LI loads the sign-extended 6-bit immediate, imm, into register rd.
Expand Down Expand Up @@ -1064,6 +1077,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cbeqz: /* C.BEQZ */
/* BEQZ performs conditional control transfers. The offset is
Expand All @@ -1074,10 +1088,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
*/
rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cbnez: /* C.BNEZ */
rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cslli: /* C.SLLI */
/* C.SLLI is a CI-format instruction that performs a logical left shift
Expand All @@ -1100,6 +1116,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
case rv_insn_cjr: /* C.JR */
rv->PC = rv->X[ir->rs1];
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cmv: /* C.MV */
rv->X[ir->rd] = rv->X[ir->rs2];
Expand All @@ -1108,6 +1125,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
rv->compressed = true;
rv->io.on_ebreak(rv);
/* can branch */
rv->csr_cycle++;
return true;
case rv_insn_cjalr: { /* C.JALR */
/* Unconditional jump and store PC+2 to ra */
Expand All @@ -1120,6 +1138,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
return false;
}
/* can branch */
rv->csr_cycle++;
return true;
}
case rv_insn_cadd: /* C.ADD */
Expand Down Expand Up @@ -1147,7 +1166,10 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)

/* step over instruction */
rv->PC += ir->insn_len;
return true;
rv->csr_cycle++;
if (ir->tailcall)
return true;
MUST_TAIL return emulate(rv, ir + 1);
}

static bool insn_is_branch(uint8_t opcode)
Expand Down Expand Up @@ -1240,27 +1262,6 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr)
return NULL;
}

/* Execute a translated basic block.
 *
 * Walks the block's instruction array from the first entry to the last,
 * handing each entry to emulate(). Before every instruction the zero
 * register is forced back to 0, and after every instruction that emulate()
 * reports as successful the cycle CSR is incremented by one.
 *
 * Returns false as soon as emulate() returns false for an instruction (the
 * remaining instructions are not executed and the cycle CSR is not bumped
 * for the failing one); returns true once the whole block has been run.
 */
static bool block_emulate(riscv_t *rv, const block_t *block)
{
    const rv_insn_t *cur = block->ir;
    const rv_insn_t *const end = cur + block->n_insn;

    for (; cur != end; cur++) {
        /* x0 must always read as zero, whatever the last insn wrote */
        rv->X[rv_reg_zero] = 0;

        /* stop at the first instruction that fails to execute */
        if (!emulate(rv, cur))
            return false;

        /* one more instruction retired */
        rv->csr_cycle += 1;
    }

    return true;
}

static void block_translate(riscv_t *rv, block_t *block)
{
block->pc_start = block->pc_end = rv->PC;
Expand Down Expand Up @@ -1288,6 +1289,7 @@ static void block_translate(riscv_t *rv, block_t *block)
if (insn_is_branch(ir->opcode))
break;
}
block->ir[block->n_insn - 1].tailcall = true;
}

static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
Expand Down Expand Up @@ -1350,7 +1352,7 @@ void rv_step(riscv_t *rv, int32_t cycles)
assert(block);

/* execute the block */
if (!block_emulate(rv, block))
if (!emulate(rv, block->ir))
break;

prev = block;
Expand Down

0 comments on commit 044cd7b

Please sign in to comment.