Merge pull request #4161 from bylaws/tf
FEXCore: Emulate EFLAGS.TF
Sonicadvance1 authored Dec 12, 2024
2 parents b03b02d + 8d32041 commit e88c92d
Showing 20 changed files with 368 additions and 112 deletions.
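
For orientation, here is a minimal sketch (not part of the diff) of the behaviour this commit adds: when the guest sets EFLAGS.TF, compilation is routed through a single-instruction path so a single-step trap can be delivered after each guest instruction. The names GuestFrame, DispatchLoop, ExecuteBlock, and DeliverSingleStepTrap below are placeholders, not FEX APIs.

```cpp
#include <cstdint>

// Hypothetical, simplified model of trap-flag handling; the real logic lives in
// ContextImpl::CompileCode/CompileSingleStep and the Arm64 dispatcher below.
struct GuestFrame {
  uint64_t rip;
  bool tf; // mirrors EFLAGS.TF in the guest flags state
};

uintptr_t CompileBlock(GuestFrame&, uint64_t rip, uint64_t maxInst); // maxInst = 0 -> no limit
void ExecuteBlock(uintptr_t code);       // placeholder
void DeliverSingleStepTrap(GuestFrame&); // placeholder for raising #DB / SIGTRAP

void DispatchLoop(GuestFrame& frame) {
  for (;;) {
    if (frame.tf) {
      // Trap flag set: compile exactly one guest instruction so a single-step
      // exception can be checked for and delivered between instructions.
      ExecuteBlock(CompileBlock(frame, frame.rip, /*maxInst=*/1));
      DeliverSingleStepTrap(frame);
    } else {
      ExecuteBlock(CompileBlock(frame, frame.rip, /*maxInst=*/0));
    }
  }
}
```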
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/Context/Context.h
@@ -288,6 +288,7 @@ class ContextImpl final : public FEXCore::Context::Context {
[[nodiscard]]
CompileCodeResult CompileCode(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst = 0);
uintptr_t CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst = 0);
uintptr_t CompileSingleStep(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP);

IR::OpSize GetGPROpSize() const {
return Config.Is64BitMode ? IR::OpSize::i64Bit : IR::OpSize::i32Bit;
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/CPUBackend.h
@@ -121,12 +121,13 @@ namespace CPU {
*
* @param IR - IR that maps to the IR for this RIP
* @param DebugData - Debug data that is available for this IR indirectly
* @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block
*
* @return Information about the compiled code block.
*/
[[nodiscard]]
virtual CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData,
const FEXCore::IR::RegisterAllocationData* RAData) = 0;
const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0;

/**
* @brief Relocates a block of code from the JIT code object cache
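
A hedged sketch of how a backend might honour the new CheckTF parameter (MyBackend, EmitTrapFlagCheck, EmitBlockBody, and FinalizeBlock are placeholders; the concrete Arm64 JIT implementation is elsewhere in this commit):

```cpp
// Hypothetical backend override showing the intent of the new CheckTF flag:
// when set, the block begins with a check of EFLAGS.TF so a single-step
// exception can be raised for the block.
FEXCore::CPU::CPUBackend::CompiledCode
MyBackend::CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR,
                       FEXCore::Core::DebugData* DebugData,
                       const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) {
  if (CheckTF) {
    EmitTrapFlagCheck(); // placeholder: emit the EFLAGS.TF check at block entry
  }
  EmitBlockBody(IR, RAData);              // placeholder: normal code generation
  return FinalizeBlock(Entry, DebugData); // placeholder: produce the CompiledCode result
}
```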
28 changes: 27 additions & 1 deletion FEXCore/Source/Interface/Core/Core.cpp
@@ -161,6 +161,7 @@ uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadSt
case X86State::RFLAG_CF_RAW_LOC:
case X86State::RFLAG_PF_RAW_LOC:
case X86State::RFLAG_AF_RAW_LOC:
case X86State::RFLAG_TF_RAW_LOC:
case X86State::RFLAG_ZF_RAW_LOC:
case X86State::RFLAG_SF_RAW_LOC:
case X86State::RFLAG_OF_RAW_LOC:
@@ -213,6 +214,9 @@ uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadSt
uint32_t AF = ((Frame->State.af_raw ^ PFByte) & (1 << 4)) ? 1 : 0;
EFLAGS |= AF << X86State::RFLAG_AF_RAW_LOC;

uint8_t TFByte = Frame->State.flags[X86State::RFLAG_TF_RAW_LOC];
EFLAGS |= (TFByte & 1) << X86State::RFLAG_TF_RAW_LOC;

// DF is pretransformed, undo the transform from 1/-1 back to 0/1
uint8_t DFByte = Frame->State.flags[X86State::RFLAG_DF_RAW_LOC];
if (DFByte & 0x80) {
@@ -772,13 +776,17 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
if (!IR) {
return {};
}

// If the trap flag is set, we generate single-instruction blocks that each check whether to raise a single-step exception.
bool TFSet = Thread->CurrentFrame->State.flags[X86State::RFLAG_TF_RAW_LOC];

// Attempt to get the CPU backend to compile this code
auto IRView = IR->GetIRView();
return {
// FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint
// In the future with code caching getting wired up, we will pass the rest of the data forward.
// TODO: Pass the data forward when code caching is wired up to this.
.CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, &IRView, DebugData, IR->RAData()).BlockEntry,
.CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry,
.IR = std::move(IR),
.DebugData = DebugData,
.GeneratedIR = true,
@@ -863,6 +871,24 @@ uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_
return (uintptr_t)CodePtr;
}

uintptr_t ContextImpl::CompileSingleStep(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP) {
FEXCORE_PROFILE_SCOPED("CompileSingleStep");
auto Thread = Frame->Thread;

// Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
auto lk = GuardSignalDeferringSection<std::shared_lock>(CodeInvalidationMutex, Thread);

auto [CodePtr, IR, DebugData, GeneratedIR, StartAddr, Length] = CompileCode(Thread, GuestRIP, 1);
if (CodePtr == nullptr) {
return 0;
}

// Clear any relocations that might have been generated
Thread->CPUBackend->ClearRelocations();

return (uintptr_t)CodePtr;
}
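
A rough C++ analogue (illustrative only, assuming FEXCore internal headers) of how the new CompileSingleStep path gets selected; the dispatcher below does the equivalent in emitted Arm64 code by testing the TF flag byte before the block-cache lookup:

```cpp
// Illustrative only: the real selection is done by the emitted dispatcher,
// not by C++ code like this.
uintptr_t SelectCompile(FEXCore::Context::ContextImpl* CTX, FEXCore::Core::CpuStateFrame* Frame) {
  const uint64_t GuestRIP = Frame->State.rip;
  const bool TFSet = Frame->State.flags[FEXCore::X86State::RFLAG_TF_RAW_LOC] & 1;
  return TFSet ? CTX->CompileSingleStep(Frame, GuestRIP) // one guest instruction per block
               : CTX->CompileBlock(Frame, GuestRIP);     // normal multi-instruction block
}
```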

static void InvalidateGuestThreadCodeRange(FEXCore::Core::InternalThreadState* Thread, uint64_t Start, uint64_t Length) {
std::lock_guard<std::recursive_mutex> lk(Thread->LookupCache->WriteLock);

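
As a worked example of the TF handling added to ReconstructCompactedEFLAGS above: a minimal sketch, assuming RFLAG_TF_RAW_LOC matches the architectural TF bit position (bit 8, mask 0x100), which the shift in that code relies on.

```cpp
#include <cstdint>

// Sketch: FEX stores one byte per flag; reconstruction ORs the low bit of the
// TF byte back into EFLAGS at its architectural position.
constexpr uint32_t RFLAG_TF_RAW_LOC = 8; // assumption: matches the x86 TF bit

uint32_t PackTF(const uint8_t* flags, uint32_t EFLAGS) {
  const uint8_t TFByte = flags[RFLAG_TF_RAW_LOC];
  EFLAGS |= (TFByte & 1u) << RFLAG_TF_RAW_LOC; // sets bit 8 (0x100) when the TF byte is 1
  return EFLAGS;
}
```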
180 changes: 110 additions & 70 deletions FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
@@ -46,6 +46,8 @@ Dispatcher::~Dispatcher() {
}

void Dispatcher::EmitDispatcher() {
// Don't modify TMP3 since it contains our RIP once the block doesn't exist
auto RipReg = TMP3;
#ifdef VIXL_DISASSEMBLER
const auto DisasmBegin = GetCursorAddress<const vixl::aarch64::Instruction*>();
#endif
@@ -62,7 +64,8 @@ void Dispatcher::EmitDispatcher() {

ARMEmitter::ForwardLabel l_CTX;
ARMEmitter::SingleUseForwardLabel l_Sleep;
ARMEmitter::SingleUseForwardLabel l_CompileBlock;
ARMEmitter::ForwardLabel l_CompileBlock;
ARMEmitter::ForwardLabel l_CompileSingleStep;

// Push all the register we need to save
PushCalleeSavedRegisters();
@@ -81,6 +84,7 @@

FillStaticRegs();
ARMEmitter::BiDirectionalLabel LoopTop {};
ARMEmitter::ForwardLabel CompileSingleStep;

#ifdef _M_ARM_64EC
b(&LoopTop);
@@ -116,10 +120,11 @@
AbsoluteLoopTopAddress = GetCursorAddress<uint64_t>();

// Load in our RIP
// Don't modify TMP3 since it contains our RIP once the block doesn't exist
auto RipReg = TMP3;
ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip));

ldrb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC]));
cbnz(ARMEmitter::Size::i32Bit, TMP1, &CompileSingleStep);

// L1 Cache
ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.L1Pointer));

@@ -204,37 +209,21 @@
ret();
}

{
ExitFunctionLinkerAddress = GetCursorAddress<uint64_t>();
SpillStaticRegs(TMP1);

// Clobbers TMP1/2
auto EmitSignalGuardedRegion = [&](auto Body) {
#ifndef _WIN32
ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1);
str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1);
str(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
#endif

#ifdef _M_ARM_64EC
ldr(ARMEmitter::XReg::x0, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
LoadConstant(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, 1);
strb(ARMEmitter::WReg::w1, ARMEmitter::XReg::x0, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
LoadConstant(ARMEmitter::Size::i32Bit, TMP1, 1);
strb(TMP1.W(), TMP2, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
#endif

mov(ARMEmitter::XReg::x0, STATE);
mov(ARMEmitter::XReg::x1, ARMEmitter::XReg::lr);

ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.ExitFunctionLink));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uintptr_t, void*, void*>(ARMEmitter::Reg::r2);
} else {
blr(ARMEmitter::Reg::r2);
}

if (!TMP_ABIARGS) {
mov(TMP1, ARMEmitter::XReg::x0);
}

FillStaticRegs();
Body();

#ifdef _M_ARM_64EC
ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
@@ -250,15 +239,36 @@
strb(ARMEmitter::XReg::zr, STATE,
offsetof(FEXCore::Core::InternalThreadState, InterruptFaultPage) - offsetof(FEXCore::Core::InternalThreadState, BaseFrameState));
#endif
};

{
ExitFunctionLinkerAddress = GetCursorAddress<uint64_t>();
EmitSignalGuardedRegion([&]() {
SpillStaticRegs(TMP1);

mov(ARMEmitter::XReg::x0, STATE);
mov(ARMEmitter::XReg::x1, ARMEmitter::XReg::lr);

ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.ExitFunctionLink));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uintptr_t, void*, void*>(ARMEmitter::Reg::r2);
} else {
blr(ARMEmitter::Reg::r2);
}

if (!TMP_ABIARGS) {
mov(TMP1, ARMEmitter::XReg::x0);
}

FillStaticRegs();
});

br(TMP1);
}

// Need to create the block
{
Bind(&NoBlock);

#ifdef _M_ARM_64EC
// Clobbers TMP1/2
auto EmitECExitCheck = [&]() {
// Check the EC code bitmap in case we need to exit the JIT to call into native code.
ARMEmitter::SingleUseForwardLabel l_NotECCode;
ldr(TMP1, ARMEmitter::XReg::x18, TEB_PEB_OFFSET);
@@ -277,56 +287,83 @@
br(TMP2);

Bind(&l_NotECCode);
};
#endif

SpillStaticRegs(TMP1);

if (!TMP_ABIARGS) {
mov(ARMEmitter::XReg::x2, RipReg);
}

#ifndef _WIN32
ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1);
str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
#endif
// Need to create the block
{
Bind(&NoBlock);

#ifdef _M_ARM_64EC
ldr(ARMEmitter::XReg::x0, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
LoadConstant(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, 1);
strb(ARMEmitter::WReg::w1, ARMEmitter::XReg::x0, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
EmitECExitCheck();
#endif

ldr(ARMEmitter::XReg::x0, &l_CTX);
mov(ARMEmitter::XReg::x1, STATE);
// x2 contains guest RIP
mov(ARMEmitter::XReg::x3, 0);
ldr(ARMEmitter::XReg::x4, &l_CompileBlock);
EmitSignalGuardedRegion([&]() {
SpillStaticRegs(TMP1);

if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uintptr_t, void*, void*, uint64_t, uint64_t>(ARMEmitter::Reg::r4);
} else {
blr(ARMEmitter::Reg::r4); // { CTX, Frame, RIP, MaxInst }
}
if (!TMP_ABIARGS) {
mov(ARMEmitter::XReg::x2, RipReg);
}

FillStaticRegs();
ldr(ARMEmitter::XReg::x0, &l_CTX);
mov(ARMEmitter::XReg::x1, STATE);
// x2 contains guest RIP
mov(ARMEmitter::XReg::x3, 0);
ldr(ARMEmitter::XReg::x4, &l_CompileBlock);

if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uintptr_t, void*, void*, uint64_t, uint64_t>(ARMEmitter::Reg::r4);
} else {
blr(ARMEmitter::Reg::r4); // { CTX, Frame, RIP, MaxInst }
}

// Result is now in x0
if (!TMP_ABIARGS) {
mov(TMP1, ARMEmitter::XReg::x0);
}

FillStaticRegs();
});

// Jump to the compiled block
br(TMP1);
}

{
Bind(&CompileSingleStep);

#ifdef _M_ARM_64EC
ldr(TMP1, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
strb(ARMEmitter::WReg::zr, TMP1, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
EmitECExitCheck();
#endif

#ifndef _WIN32
ldr(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1);
str(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount));
EmitSignalGuardedRegion([&]() {
SpillStaticRegs(TMP1);

// Trigger segfault if any deferred signals are pending
strb(ARMEmitter::XReg::zr, STATE,
offsetof(FEXCore::Core::InternalThreadState, InterruptFaultPage) - offsetof(FEXCore::Core::InternalThreadState, BaseFrameState));
#endif
if (!TMP_ABIARGS) {
mov(ARMEmitter::XReg::x2, RipReg);
}

b(&LoopTop);
ldr(ARMEmitter::XReg::x0, &l_CTX);
mov(ARMEmitter::XReg::x1, STATE);
// x2 contains guest RIP
ldr(ARMEmitter::XReg::x4, &l_CompileSingleStep);

if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uintptr_t, void*, void*, uint64_t, uint64_t>(ARMEmitter::Reg::r4);
} else {
blr(ARMEmitter::Reg::r4); // { CTX, Frame, RIP }
}

// Result is now in x0
if (!TMP_ABIARGS) {
mov(TMP1, ARMEmitter::XReg::x0);
}

FillStaticRegs();
});

// Jump to the compiled block
br(TMP1);
}

{
@@ -505,8 +542,11 @@
Bind(&l_Sleep);
dc64(reinterpret_cast<uint64_t>(SleepThread));
Bind(&l_CompileBlock);
FEXCore::Utils::MemberFunctionToPointerCast PMF(&FEXCore::Context::ContextImpl::CompileBlock);
dc64(PMF.GetConvertedPointer());
FEXCore::Utils::MemberFunctionToPointerCast PMFCompileBlock(&FEXCore::Context::ContextImpl::CompileBlock);
dc64(PMFCompileBlock.GetConvertedPointer());
Bind(&l_CompileSingleStep);
FEXCore::Utils::MemberFunctionToPointerCast PMFCompileSingleStep(&FEXCore::Context::ContextImpl::CompileSingleStep);
dc64(PMFCompileSingleStep.GetConvertedPointer());

Start = reinterpret_cast<uint64_t>(DispatchPtr);
End = GetCursorAddress<uint64_t>();
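
The EmitSignalGuardedRegion helper factored out above brackets the dispatcher's runtime calls: on the non-Windows path it bumps the deferred-signal refcount before the call, drops it afterwards, and then stores to the interrupt fault page so any signal that arrived while deferred is delivered; the ARM64EC path toggles the CPU-area syscall flag instead. A rough C++ analogue of that pattern (illustrative, not a FEX API) is an RAII guard:

```cpp
#include <cstdint>

// Illustrative analogue of the bracket the dispatcher emits around runtime
// calls: bump the per-thread deferral count, run the body, drop the count,
// then store to the interrupt fault page; if a signal arrived while deferred,
// that store faults and the signal handler delivers it.
class DeferredSignalGuard {
public:
  DeferredSignalGuard(uint64_t& RefCount, volatile uint8_t* FaultPage)
    : RefCount_(RefCount), FaultPage_(FaultPage) {
    ++RefCount_; // defer signal delivery for the duration of the region
  }
  ~DeferredSignalGuard() {
    --RefCount_;
    *FaultPage_ = 0; // mirrors the strb to InterruptFaultPage in the emitted code
  }
private:
  uint64_t& RefCount_;
  volatile uint8_t* FaultPage_;
};
```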