From 044cd7b0e7442070856ff277fcc056ffdadfb1b9 Mon Sep 17 00:00:00 2001
From: Yen-Fu Chen
Date: Tue, 13 Dec 2022 23:56:14 +0800
Subject: [PATCH] Use TCO of C compiler to speed up emulation

We rewrite the function emulate into a self-recursive version so that it
meets the requirements of tail-call optimization (TCO). To achieve this,
I add a member tailcall to struct rv_insn_t that records whether an IR is
the last one of its basic block; emulate uses it to decide whether to
return or to tail-call itself on the next IR. Running the CoreMark
benchmark now produces faster results than it did previously, and the
test results are shown below.

Test environment 1: Ubuntu Linux 20.04 on Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
Compiler: gcc 9.4.0
CoreMark test result:
Previous: 870.317612 Iterations/Sec
Now:      920.675364 Iterations/Sec
-----------------------------------------------------------------------
Test environment 2: Ubuntu Linux 20.04 on Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
Compiler: clang-16
CoreMark test result:
Previous: 805.702322 Iterations/Sec
Now:      849.445119 Iterations/Sec
-----------------------------------------------------------------------
Test environment 3: Ubuntu Linux on eMAG
Compiler: gcc 11.3.0
CoreMark test result:
Previous: 311.436129 Iterations/Sec
Now:      313.900684 Iterations/Sec
-----------------------------------------------------------------------
Test environment 4: Ubuntu Linux on eMAG
Compiler: clang-16
CoreMark test result:
Previous: 273.265220 Iterations/Sec
Now:      297.255706 Iterations/Sec

Previously, when the function emulate finished an instruction, it
returned to the function block_emulate, because the calling route was
rv_step -> block_emulate -> emulate -> block_emulate -> emulate -> ...,
so each call to emulate created a new stack frame. The calling route is
now rv_step -> emulate -> emulate -> ..., and because each recursive
call is a tail call, TCO lets every call to emulate reuse the same
stack frame. That is, all instructions of a basic block execute within
a single stack frame, saving the overhead of creating one frame per
instruction.
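The resulting dispatch shape is sketched below. This is a minimal model
for illustration only, not the emulator's actual code: insn_t, its
fields, and run_block are invented names, and the real emulate also
advances the PC and the cycle counter.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct {
        int op;        /* stand-in for a decoded instruction */
        bool tailcall; /* true only for the last IR of the block */
    } insn_t;

    /* The recursive call sits in tail position, so the compiler can
     * replace its call-and-return pair with a plain jump that reuses
     * the current stack frame. */
    static bool run_block(const insn_t *ir)
    {
        printf("executing op %d\n", ir->op);
        if (ir->tailcall)
            return true; /* end of the basic block */
        return run_block(ir + 1); /* eligible for TCO */
    }

    int main(void)
    {
        insn_t block[] = {{0, false}, {1, false}, {2, true}};
        return run_block(block) ? 0 : 1;
    }

With the MUST_TAIL macro introduced by this patch, the recursive call
would be spelled "MUST_TAIL return run_block(ir + 1);", which asks
compilers that support the musttail attribute (e.g., recent clang) to
enforce the tail call even in non-optimized builds.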
---
 Makefile      |  3 ++-
 src/common.h  | 10 ++++++++++
 src/decode.h  | 11 +++++++++++
 src/emulate.c | 48 +++++++++++++++++++++++++-----------------------
 4 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index 4d87da712..66d0e2250 100644
--- a/Makefile
+++ b/Makefile
@@ -65,7 +65,8 @@ endif
 $(call set-feature, COMPUTED_GOTO)
 ifeq ($(call has, COMPUTED_GOTO), 1)
 ifeq ("$(CC_IS_GCC)", "1")
-$(OUT)/emulate.o: CFLAGS += -fno-gcse -fno-crossjumping
+# Tail-call elimination requires a specific set of build flags.
+$(OUT)/emulate.o: CFLAGS += -fno-gcse -fno-crossjumping -fomit-frame-pointer -fno-stack-check -fno-stack-protector
 endif
 endif
 
diff --git a/src/common.h b/src/common.h
index 348764125..e609cbb80 100644
--- a/src/common.h
+++ b/src/common.h
@@ -39,3 +39,13 @@
 #define IIF_0(t, ...) __VA_ARGS__
 /* run the 1st parameter */
 #define IIF_1(t, ...) t
+
+/* There is no tail-call optimization (TCO) in non-optimized builds. To work
+ * around this, we attempt to use the compiler attribute "musttail", which
+ * forces the compiler to tail-call optimize even when optimizations are off.
+ */
+#if defined(__has_attribute) && __has_attribute(musttail)
+#define MUST_TAIL __attribute__((musttail))
+#else
+#define MUST_TAIL
+#endif
\ No newline at end of file
diff --git a/src/decode.h b/src/decode.h
index 63f134944..34255e39e 100644
--- a/src/decode.h
+++ b/src/decode.h
@@ -240,6 +240,17 @@ typedef struct {
 
     /* instruction length */
     uint8_t insn_len;
+    /* With tail-call optimization (TCO), if a C function ends with a call
+     * to another function, or to itself, and simply returns that function's
+     * result, the compiler can replace the 'call' and 'return' instructions
+     * with a simple jump to the callee. A self-recursive function can
+     * therefore reuse the same stack frame.
+     *
+     * The member tailcall tells us whether an IR is the last one of its
+     * basic block. We use it to rewrite the function emulate into a
+     * self-recursive version, allowing the compiler to apply TCO.
+     */
+    bool tailcall;
 } rv_insn_t;
 
 /* translated basic block */
diff --git a/src/emulate.c b/src/emulate.c
index 995e3b6f1..d9ccbdcf0 100644
--- a/src/emulate.c
+++ b/src/emulate.c
@@ -256,6 +256,7 @@ static bool insn_is_misaligned(uint32_t pc)
 
 static bool emulate(riscv_t *rv, const rv_insn_t *ir)
 {
+    rv->X[rv_reg_zero] = 0;
     switch (ir->opcode) {
     /* RV32I Base Instruction Set */
     case rv_insn_lui: /* LUI: Load Upper Immediate */
@@ -292,6 +293,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     case rv_insn_jalr: { /* JALR: Jump and Link Register */
@@ -315,6 +317,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     case rv_insn_beq: { /* BEQ: Branch if Equal */
@@ -328,6 +331,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -343,6 +347,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -358,6 +363,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -373,6 +379,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -388,6 +395,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -403,6 +411,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     break;
@@ -553,10 +562,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
     case rv_insn_ecall: /* ECALL: Environment Call */
         rv->compressed = false;
        rv->io.on_ecall(rv);
+        rv->csr_cycle++;
         return true;
     case rv_insn_ebreak: /* EBREAK: Environment Break */
         rv->compressed = false;
         rv->io.on_ebreak(rv);
+        rv->csr_cycle++;
         return true;
     case rv_insn_wfi: /* WFI: Wait for Interrupt */
     case rv_insn_uret: /* URET: return from traps in U-mode */
@@ -567,6 +578,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
     case rv_insn_mret: /* MRET: return from traps in U-mode */
         rv->PC = rv->csr_mepc;
         /* this is a branch */
+        rv->csr_cycle++;
         return true;
 
 #if RV32_HAS(Zifencei)
@@ -986,6 +998,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cli: /* C.LI */
         /* C.LI loads the sign-extended 6-bit immediate, imm, into
          * register rd.
@@ -1064,6 +1077,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cbeqz: /* C.BEQZ */
         /* BEQZ performs conditional control transfers. The offset is
@@ -1074,10 +1088,12 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
          */
         rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cbnez: /* C.BEQZ */
         rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len;
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cslli: /* C.SLLI */
         /* C.SLLI is a CI-format instruction that performs a logical left shift
@@ -1100,6 +1116,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
     case rv_insn_cjr: /* C.JR */
         rv->PC = rv->X[ir->rs1];
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cmv: /* C.MV */
         rv->X[ir->rd] = rv->X[ir->rs2];
@@ -1108,6 +1125,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
         rv->compressed = true;
         rv->io.on_ebreak(rv);
         /* can branch */
+        rv->csr_cycle++;
         return true;
     case rv_insn_cjalr: { /* C.JALR */
         /* Unconditional jump and store PC+2 to ra */
@@ -1120,6 +1138,7 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
             return false;
         }
         /* can branch */
+        rv->csr_cycle++;
         return true;
     }
     case rv_insn_cadd: /* C.ADD */
@@ -1147,7 +1166,10 @@ static bool emulate(riscv_t *rv, const rv_insn_t *ir)
 
     /* step over instruction */
     rv->PC += ir->insn_len;
-    return true;
+    rv->csr_cycle++;
+    if (ir->tailcall)
+        return true;
+    MUST_TAIL return emulate(rv, ir + 1);
 }
 
 static bool insn_is_branch(uint8_t opcode)
@@ -1240,27 +1262,6 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr)
     return NULL;
 }
 
-/* execute a basic block */
-static bool block_emulate(riscv_t *rv, const block_t *block)
-{
-    const uint32_t n_insn = block->n_insn;
-    const rv_insn_t *ir = block->ir;
-
-    /* execute the block */
-    for (uint32_t i = 0; i < n_insn; i++) {
-        /* enforce zero register */
-        rv->X[rv_reg_zero] = 0;
-
-        /* execute the instruction */
-        if (!emulate(rv, ir + i))
-            return false;
-
-        /* increment the cycles csr */
-        rv->csr_cycle++;
-    }
-    return true;
-}
-
 static void block_translate(riscv_t *rv, block_t *block)
 {
     block->pc_start = block->pc_end = rv->PC;
@@ -1288,6 +1289,7 @@ static void block_translate(riscv_t *rv, block_t *block)
         if (insn_is_branch(ir->opcode))
             break;
     }
+    block->ir[block->n_insn - 1].tailcall = true;
 }
 
 static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
@@ -1350,7 +1352,7 @@ void rv_step(riscv_t *rv, int32_t cycles)
         assert(block);
 
         /* execute the block */
-        if (!block_emulate(rv, block))
+        if (!emulate(rv, block->ir))
             break;
 
         prev = block;