Implement loop peeling. #1510

Draft · wants to merge 16 commits into base: master
Changes from 2 commits
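For context, a minimal sketch of what loop peeling means here (an editor's illustration in Rust, not code from this PR): the first iteration of a loop is split off into a run-once header, so that loop-invariant work can be hoisted out of the repeating body.

    // Before peeling: the invariant is recomputed on every iteration.
    while cond() {
        let a = invariant();
        body(a);
    }

    // After peeling the first iteration into a "header": `invariant()`
    // runs once and the repeating "body" reuses `a`.
    if cond() {
        let a = invariant();
        body(a);
        while cond() {
            body(a);
        }
    }

The diff below mirrors this split: traces now have a header (entered once, which also serves as the re-entry point for side-traces) and a body (the part that loops).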
9 changes: 5 additions & 4 deletions tests/c/nested_sidetrace.c
@@ -32,16 +32,17 @@
// 18
// yk-jit-event: enter-jit-code
// yk-jit-event: deoptimise
// yk-jit-event: start-side-tracing
// 20
// yk-jit-event: enter-jit-code
// yk-jit-event: deoptimise
// yk-jit-event: start-side-tracing
// 22
// yk-jit-event: stop-tracing
// --- Begin jit-pre-opt ---
// ...
// --- End jit-pre-opt ---
// 22
// yk-jit-event: enter-jit-code
// yk-jit-event: execute-side-trace
// 24
// yk-jit-event: enter-jit-code
// yk-jit-event: execute-side-trace
// 26
// yk-jit-event: execute-side-trace
2 changes: 1 addition & 1 deletion tests/c/side-trace.c
@@ -87,7 +87,7 @@ int foo(int i) {
int main(int argc, char **argv) {
YkMT *mt = yk_mt_new(NULL);
yk_mt_hot_threshold_set(mt, 0);
yk_mt_sidetrace_threshold_set(mt, 5);
yk_mt_sidetrace_threshold_set(mt, 4);
YkLocation loc = yk_location_new();

int res = 0;
2 changes: 1 addition & 1 deletion tests/c/simple.c
@@ -1,5 +1,5 @@
// Run-time:
// env-var: YKD_LOG_IR=aot,jit-pre-opt
// env-var: YKD_LOG_IR=aot,jit-pre-opt,jit-post-opt
// env-var: YKD_SERIALISE_COMPILATION=1
// env-var: YK_LOG=4
// stderr:
23 changes: 21 additions & 2 deletions ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs
@@ -175,6 +175,22 @@ impl<'a> LSRegAlloc<'a> {
}
}

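/// Reset the allocator so that all general purpose and floating point registers are
/// [RegState::Empty], with the reserved registers marked [RegState::Reserved] again.
/// Called when codegen transitions from the trace header to the trace body, whose
/// inputs are then assigned afresh (see `cg_thead_jump`).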
pub(crate) fn reset(&mut self) {
for rs in self.gp_reg_states.iter_mut() {
*rs = RegState::Empty;
}
for reg in RESERVED_GP_REGS {
self.gp_reg_states[usize::from(reg.code())] = RegState::Reserved;
}

for rs in self.fp_reg_states.iter_mut() {
*rs = RegState::Empty;
}
for reg in RESERVED_FP_REGS {
self.fp_reg_states[usize::from(reg.code())] = RegState::Reserved;
}
}

/// Before generating code for the instruction at `iidx`, see which registers are no longer
/// needed and mark them as [RegState::Empty]. Calling this allows the register allocator to
/// use the set of available registers more efficiently.
@@ -891,14 +907,17 @@ impl LSRegAlloc<'_> {
match inst {
Inst::Copy(_) => panic!(),
Inst::Const(cidx) => match self.m.const_(cidx) {
Const::Float(_, _) => todo!(),
Const::Float(_, v) => VarLocation::ConstFloat(*v),
Const::Int(tyidx, v) => {
let Ty::Integer(bits) = self.m.type_(*tyidx) else {
panic!()
};
VarLocation::ConstInt { bits: *bits, v: *v }
}
Const::Ptr(_) => todo!(),
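// On x64, pointer constants can be materialised as 64-bit integer constants.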
Const::Ptr(p) => VarLocation::ConstInt {
bits: 64,
v: u64::try_from(*p).unwrap(),
},
},
_ => match self.spills[usize::from(iidx)] {
SpillState::Empty => panic!(),
148 changes: 112 additions & 36 deletions ykrt/src/compile/jitc_yk/codegen/x64/mod.rs
@@ -177,7 +177,9 @@ impl X64CodeGen {
struct Assemble<'a> {
m: &'a jit_ir::Module,
ra: LSRegAlloc<'a>,
/// The locations of the live variables at the beginning of the loop.
/// The locations of the live variables at the beginning of the trace header.
trace_reentry_locs: Vec<VarLocation>,
/// The locations of the live variables at the beginning of the trace body.
loop_start_locs: Vec<VarLocation>,
asm: dynasmrt::x64::Assembler,
/// Deopt info, with one entry per guard, in the order that the guards appear in the trace.
@@ -202,6 +204,12 @@ struct Assemble<'a> {
/// layer of dead-code elimination: it doesn't cause JIT IR instructions to be removed, but
/// it will stop any code being (directly) generated for some of them.
used_insts: Vob,
/// The offset after the trace's prologue. This is the re-entry point when returning from
/// side-traces.
prologue_offset: AssemblyOffset,
/// Whether or not to skip processing [Param]s. We enable this once we've finished processing
/// the header, as [Param]s in the trace body are currently only placeholders.
skip_params: bool,
}

impl<'a> Assemble<'a> {
@@ -255,13 +263,16 @@ impl<'a> Assemble<'a> {
m,
ra: LSRegAlloc::new(m, inst_vals_alive_until, vloc_hints, sp_offset),
asm,
trace_reentry_locs: Vec::new(),
loop_start_locs: Vec::new(),
deoptinfo: HashMap::new(),
comments: Cell::new(IndexMap::new()),
sp_offset,
root_offset,
used_insts,
ptradds,
prologue_offset: AssemblyOffset(0),
skip_params: false,
}))
}

@@ -290,10 +301,6 @@ impl<'a> Assemble<'a> {
}

let alloc_off = self.emit_prologue();
// The instruction offset after we've emitted the prologue (i.e. updated the stack
// pointer). We will later adjust this offset to also include one iteration of the trace
// so we can jump directly to the peeled loop.
let prologue_offset = self.asm.offset();

self.cg_insts()?;

@@ -452,8 +459,8 @@
deoptinfo: self.deoptinfo,
prevguards,
sp_offset: self.ra.stack_size(),
prologue_offset: prologue_offset.0,
entry_vars: self.loop_start_locs.clone(),
prologue_offset: self.prologue_offset.0,
entry_vars: self.trace_reentry_locs.clone(),
hl: Arc::downgrade(&hl),
comments: self.comments.take(),
#[cfg(any(debug_assertions, test))]
@@ -506,8 +513,10 @@
continue;
}
jit_ir::Inst::Guard(i) => self.cg_guard(iidx, i),
jit_ir::Inst::TraceLoopStart => self.cg_traceloopstart(),
jit_ir::Inst::TraceLoopJump => self.cg_traceloopjump(),
jit_ir::Inst::TraceHeaderStart => self.cg_tracereentrypoint(),
jit_ir::Inst::TraceHeaderEnd => self.cg_thead_jump(),
jit_ir::Inst::TraceBodyStart => self.cg_traceloopstart(),
jit_ir::Inst::TraceBodyEnd => self.cg_traceloopjump(),
jit_ir::Inst::RootJump => self.cg_rootjump(self.m.root_jump_addr()),
jit_ir::Inst::SExt(i) => self.cg_sext(iidx, i),
jit_ir::Inst::ZExt(i) => self.cg_zext(iidx, i),
@@ -1062,6 +1071,9 @@ impl<'a> Assemble<'a> {
/// Codegen a [jit_ir::ParamInst]. This only informs the register allocator about the
/// locations of live variables without generating any actual machine code.
fn cg_param(&mut self, iidx: jit_ir::InstIdx, inst: &jit_ir::ParamInst) {
if self.skip_params {
return;
}
let m = VarLocation::from_yksmp_location(self.m, iidx, self.m.param(inst.paramidx()));
debug_assert!(self.m.inst(iidx).def_byte_size(self.m) <= REG64_BYTESIZE);
match m {
@@ -1777,16 +1789,10 @@ impl<'a> Assemble<'a> {
// If we pass in `None`, use `self.loop_start_locs` instead. We need to do this since we
// can't pass in `&self.loop_start_locs` directly due to borrowing restrictions.
let tgt_vars = tgt_vars.unwrap_or(self.loop_start_locs.as_slice());
for (i, op) in self.m.loop_jump_operands().iter().enumerate() {
for (i, op) in self.m.trace_body_end().iter().enumerate() {
// FIXME: This is completely broken: see the FIXME later.
let op = op.unpack(self.m);
let (iidx, src) = match op {
Operand::Var(iidx) => (iidx, self.op_to_var_location(op.clone())),
Operand::Const(_) => (
InstIdx::try_from(0).unwrap(),
self.op_to_var_location(op.clone()),
),
};
let src = self.op_to_var_location(op.clone());
let dst = tgt_vars[i];
if dst == src {
// The value is already in the correct place, so there's nothing we need to
Expand All @@ -1803,6 +1809,9 @@ impl<'a> Assemble<'a> {
8 => dynasm!(self.asm;
mov QWORD [rbp - i32::try_from(off_dst).unwrap()], Rq(reg.code())
),
4 => dynasm!(self.asm;
mov DWORD [rbp - i32::try_from(off_dst).unwrap()], Rd(reg.code())
),
_ => todo!(),
},
VarLocation::ConstInt { bits, v } => match bits {
@@ -1844,14 +1853,18 @@
reg_alloc::Register::GP(r) => {
let [_] = self.ra.assign_gp_regs(
&mut self.asm,
iidx,
// This tells the register allocator to assign this register
// to the instruction at `iidx` so we can look it up later. Since
// we are at the end of the trace we won't ever need to look this up
// and the index is irrelevant. We can just pass anything here.
InstIdx::unchecked_from(0),
[RegConstraint::InputIntoReg(op.clone(), r)],
);
}
reg_alloc::Register::FP(r) => {
let [_] = self.ra.assign_fp_regs(
&mut self.asm,
iidx,
InstIdx::unchecked_from(0),
[RegConstraint::InputIntoReg(op.clone(), r)],
);
}

Review comment (Contributor):
The "since we are at the end" bit is dangerous: the register allocator does (must!) take into account iidx when making decisions about what to spill etc. By passing in 0 here, I think weird things will happen. Is there any reason not to pass iidx here (and below)?
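A minimal sketch of the reviewer's suggestion (hypothetical; `assign_gp_regs` already accepts the real instruction index, as the removed lines above show):

    reg_alloc::Register::GP(r) => {
        let [_] = self.ra.assign_gp_regs(
            &mut self.asm,
            // Pass the real `iidx` so spill decisions see the correct
            // liveness point, rather than a dummy index of 0.
            iidx,
            [RegConstraint::InputIntoReg(op.clone(), r)],
        );
    }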
@@ -1906,19 +1919,81 @@
; jmp rdi);
}

fn cg_traceloopstart(&mut self) {
fn cg_tracereentrypoint(&mut self) {
debug_assert_eq!(self.loop_start_locs.len(), 0);
// Remember the locations of the live variables at the beginning of the trace. When we loop
// back around here we need to write the live variables back into these same locations.
for var in self.m.loop_start_vars() {
let loc = match var {
Operand::Var(iidx) => self.ra.var_location(*iidx),
// Remember the locations of the live variables at the beginning of the trace. When we
// re-enter the trace from a side-trace, we need to write the live variables back into
// these same locations.
for var in self.m.trace_header_start() {
let loc = match var.unpack(self.m) {
Operand::Var(iidx) => self.ra.var_location(iidx),
_ => panic!(),
};
self.trace_reentry_locs.push(loc);
}
dynasm!(self.asm; ->reentry:);
self.prologue_offset = self.asm.offset();
}

fn cg_thead_jump(&mut self) {
// FIXME: This is a bit of a roundabout way of doing things, especially since it means
// that the [ParamInst]s in the trace body are just placeholders. Since a recent change,
// the register allocator makes sure the values automatically end up in the
// [VarLocation]s expected by the loop start, but this currently only works for
// registers. We can extend this to spill locations as well, but won't be able to do so
// for variables that have become constants during the trace header. So we will always
// have to either update the [ParamInst]s of the trace body, which isn't ideal since it
// requires the [Module] to be mutable, or do what we do below just for constants.
let mut varlocs = Vec::new();
for var in self.m.trace_header_end().iter() {
let varloc = self.op_to_var_location(var.unpack(self.m));
varlocs.push(varloc);
}
// Reset the register allocator before priming it with information about the trace body
// inputs.
self.ra.reset();
for (i, op) in self.m.trace_body_start().iter().enumerate() {
// By definition these can only be variables.
let iidx = match op.unpack(self.m) {
Operand::Var(iidx) => iidx,
_ => panic!(),
};
let varloc = varlocs[i];

// Write the varlocations from the head jump to the body start.
// FIXME: This is copied verbatim from `cg_param` and can be reused.
match varloc {
VarLocation::Register(reg_alloc::Register::GP(reg)) => {
self.ra.force_assign_inst_gp_reg(&mut self.asm, iidx, reg);
}
VarLocation::Register(reg_alloc::Register::FP(reg)) => {
self.ra.force_assign_inst_fp_reg(iidx, reg);
}
VarLocation::Direct { frame_off, size: _ } => {
self.ra.force_assign_inst_direct(iidx, frame_off);
}
VarLocation::Stack { frame_off, size: _ } => {
self.ra
.force_assign_inst_indirect(iidx, i32::try_from(frame_off).unwrap());
}
VarLocation::ConstInt { bits, v } => {
self.ra.assign_const(iidx, bits, v);
}
e => panic!("{:?}", e),
}
}
self.skip_params = true;
}

fn cg_traceloopstart(&mut self) {
debug_assert_eq!(self.loop_start_locs.len(), 0);
// Remember the locations of the live variables at the beginning of the trace loop. When we
// loop back around here we need to write the live variables back into these same
// locations.
for var in self.m.trace_body_start() {
let loc = self.op_to_var_location(var.unpack(&self.m));
self.loop_start_locs.push(loc);
}
// FIXME: peel the initial iteration of the loop to allow us to hoist loop invariants.
// When doing so, update the jump target inside side-traces.
dynasm!(self.asm; ->tloop_start:);
}
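Taken together, the functions above lay out a compiled trace roughly as follows (an informal sketch reconstructed from the labels and offsets used above, not generated output):

    //   prologue              stack frame setup
    // ->reentry:              `prologue_offset`; side-traces jump back here
    //   trace header          runs once; [Param]s load the live variables
    //   header-to-body moves  `cg_thead_jump` puts header outputs where the body expects them
    // ->tloop_start:
    //   trace body            the loop; `TraceBodyEnd` jumps back to ->tloop_start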

@@ -2290,9 +2365,10 @@ impl CompiledTrace for X64CompiledTrace {
.collect();
let callframes = deoptinfo.inlined_frames.clone();

// Calculate the address inside the root trace we want side-traces to jump to. Currently
// this is directly after the prologue. Later we will change this to jump to after the
// preamble and before the peeled loop.
// Calculate the address inside the root trace that we want side-traces to jump to. Since
// the side-trace finishes at the control point, we need to re-enter via the trace header
// and cannot jump back directly into the trace body.
// FIXME: Check if RPython has found a solution to this (if there is any).
let root_addr = unsafe { root_ctr.entry().add(root_ctr.prologue_offset) };

// Pass along [GuardIdx]'s of previous guard failures and add this guard failure's
@@ -3892,8 +3968,8 @@ mod tests {
",
"
...
; tloop_start []:
; tloop_jump []:
; tloop_start []
; tloop_jump []
jmp {{target}}
",
false,
@@ -3915,11 +3991,11 @@
...
; %0: i8 = param ...
...
; tloop_start [%0]:
; tloop_start [%0]
; %2: i8 = add %0, %0
{{_}} {{off}}: ...
...
; tloop_jump [%0]:
; tloop_jump [%0]
...
{{_}} {{_}}: jmp 0x00000000{{off}}
",
@@ -4420,8 +4496,8 @@
...
; %0: i8 = param ...
...
; tloop_start [%0]:
; tloop_jump [42i8]:
; tloop_start [%0]
; tloop_jump [42i8]
mov r.64.x, 0x2a
jmp ...
",
6 changes: 3 additions & 3 deletions ykrt/src/compile/jitc_yk/codegen/x64/rev_analyse.rs
@@ -54,7 +54,7 @@ impl<'a> RevAnalyse<'a> {
continue;
}
}
Inst::TraceLoopJump => {
Inst::TraceBodyEnd => {
self.an_tloop_jump();
}
Inst::SExt(x) => self.an_sext(iidx, x),
@@ -191,9 +191,9 @@ impl<'a> RevAnalyse<'a> {
}
}

debug_assert_eq!(param_vlocs.len(), self.m.loop_jump_operands().len());
debug_assert_eq!(param_vlocs.len(), self.m.trace_body_end().len());

for (param_vloc, jump_op) in param_vlocs.into_iter().zip(self.m.loop_jump_operands()) {
for (param_vloc, jump_op) in param_vlocs.into_iter().zip(self.m.trace_body_end()) {
if let Operand::Var(op_iidx) = jump_op.unpack(self.m) {
self.vloc_hints[usize::from(op_iidx)] = Some(param_vloc);
}