diff --git a/FEXCore/Source/Interface/Context/Context.h b/FEXCore/Source/Interface/Context/Context.h index b90375a92a..0402b3642c 100644 --- a/FEXCore/Source/Interface/Context/Context.h +++ b/FEXCore/Source/Interface/Context/Context.h @@ -88,6 +88,9 @@ class ContextImpl final : public FEXCore::Context::Context { void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) override; + bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) override; + bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) override; + uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) override; uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) override; void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) override; diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index 2348d9cd6a..b38df6ed0d 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -76,6 +76,9 @@ constexpr size_t CPU_AREA_EMULATOR_STACK_BASE_OFFSET = 0x8; constexpr size_t CPU_AREA_EMULATOR_DATA_OFFSET = 0x30; #endif +// Will force one single instruction block to be generated first if set when entering the JIT filling SRA. +constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1; + // Predicate register temporaries (used when AVX support is enabled) // PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1. // PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1. 
diff --git a/FEXCore/Source/Interface/Core/CPUBackend.h b/FEXCore/Source/Interface/Core/CPUBackend.h index 188851cd3c..c7265dc5a4 100644 --- a/FEXCore/Source/Interface/Core/CPUBackend.h +++ b/FEXCore/Source/Interface/Core/CPUBackend.h @@ -80,9 +80,16 @@ namespace CPU { struct JITCodeTail { // The total size of the codeblock from [BlockBegin, BlockBegin+Size). size_t Size; + // RIP that the block's entry comes from. uint64_t RIP; + // The length of the guest code for this block. + size_t GuestSize; + + // If this block represents a single guest instruction. + bool SingleInst; + // Number of RIP entries for this JIT Code section. uint32_t NumberOfRIPEntries; @@ -119,6 +126,8 @@ namespace CPU { * * This is a thread specific compilation unit since there is one CPUBackend per guest thread * + * @param Size - The byte size of the guest code for this block + * @param SingleInst - If this block represents a single guest instruction * @param IR - IR that maps to the IR for this RIP * @param DebugData - Debug data that is available for this IR indirectly * @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block @@ -126,8 +135,8 @@ namespace CPU { * @return Information about the compiled code block. 
*/ [[nodiscard]] - virtual CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0; + virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, + FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0; /** * @brief Relocates a block of code from the JIT code object cache diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index 9e965546e3..9277a766c7 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -112,13 +112,38 @@ ContextImpl::~ContextImpl() { } } -uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) { - const auto Frame = Thread->CurrentFrame; +struct GetFrameBlockInfoResult { + const CPU::CPUBackend::JITCodeHeader* InlineHeader; + const CPU::CPUBackend::JITCodeTail* InlineTail; +}; +static GetFrameBlockInfoResult GetFrameBlockInfo(FEXCore::Core::CpuStateFrame* Frame) { const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; auto InlineHeader = reinterpret_cast(BlockBegin); if (InlineHeader) { auto InlineTail = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail); + return {InlineHeader, InlineTail}; + } + + return {InlineHeader, nullptr}; +} + +bool ContextImpl::IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) { + auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + return InlineTail && (Address + Size > InlineTail->RIP && Address < InlineTail->RIP + InlineTail->GuestSize); +} + +bool ContextImpl::IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) { + auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + return InlineTail && InlineTail->SingleInst; +} + 
+uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) { + const auto Frame = Thread->CurrentFrame; + const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; + auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + + if (InlineHeader) { auto RIPEntries = reinterpret_cast( Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries); @@ -555,6 +580,7 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue GuestCode = reinterpret_cast(GuestRIP); bool HadDispatchError {false}; + bool HadInvalidInst {false}; Thread->FrontendDecoder->DecodeInstructionsAtEntry(GuestCode, GuestRIP, MaxInst, [Thread](uint64_t BlockEntry, uint64_t Start, uint64_t Length) { @@ -652,16 +678,23 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue ++TotalInstructions; } } else { - if (TableInfo) { - LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP); - } // Invalid instruction - Thread->OpDispatcher->InvalidOp(DecodedInfo); - Thread->OpDispatcher->ExitFunction(Thread->OpDispatcher->_EntrypointOffset(GPRSize, Block.Entry - GuestRIP)); + if (!BlockInstructionsLength) { + // SMC can modify block contents and patch invalid instructions to valid ones inline. + // End blocks upon encountering them and only emit an invalid opcode exception if there are no prior instructions in the block (that could have modified it to be valid). 
+ + if (TableInfo) { + LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP); + } + + Thread->OpDispatcher->InvalidOp(DecodedInfo); + } + + HadInvalidInst = true; } - const bool NeedsBlockEnd = - (HadDispatchError && TotalInstructions > 0) || (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock); + const bool NeedsBlockEnd = (HadDispatchError && TotalInstructions > 0) || + (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock) || HadInvalidInst; // If we had a dispatch error then leave early if (HadDispatchError && TotalInstructions == 0) { @@ -747,6 +780,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT fextl::unique_ptr IR; FEXCore::Core::DebugData* DebugData {}; + uint64_t TotalInstructions {}; uint64_t StartAddr {}; uint64_t Length {}; @@ -764,11 +798,12 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT if (!IR) { // Generate IR + Meta Info - auto [IRCopy, TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst); + auto [IRCopy, _TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst); // Setup pointers to internal structures IR = std::move(IRCopy); DebugData = new FEXCore::Core::DebugData(); + TotalInstructions = _TotalInstructions; StartAddr = _StartAddr; Length = _Length; } @@ -786,7 +821,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT // FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint // In the future with code caching getting wired up, we will pass the rest of the data forward. // TODO: Pass the data forward when code caching is wired up to this. 
- .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, + .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, TotalInstructions == 1, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, .IR = std::move(IR), .DebugData = DebugData, .GeneratedIR = true, diff --git a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp index 4157d20a25..2e0d2bfb2a 100644 --- a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp +++ b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp @@ -93,6 +93,10 @@ void Dispatcher::EmitDispatcher() { ldr(STATE, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_DATA_OFFSET); FillStaticRegs(); + ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip)); + // Force a single instruction block if ENTRY_FILL_SRA_SINGLE_INST_REG is nonzero entering the JIT, used for inline SMC handling. + cbnz(ARMEmitter::Size::i32Bit, ENTRY_FILL_SRA_SINGLE_INST_REG, &CompileSingleStep); + // Enter JIT b(&LoopTop); diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index 03fe12c674..5f5d7580e2 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -720,8 +720,9 @@ void Arm64JITCore::EmitInterruptChecks(bool CheckTF) { #endif } -CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) { +CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, + FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, + bool CheckTF) { FEXCORE_PROFILE_SCOPED("Arm64::CompileCode"); JumpTargets.clear(); @@ -861,6 +862,8 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore // TODO: This 
needs to be a data RIP relocation once code caching works. // Current relocation code doesn't support this feature yet. JITBlockTail->RIP = Entry; + JITBlockTail->GuestSize = Size; + JITBlockTail->SingleInst = SingleInst; JITBlockTail->SpinLockFutex = 0; { diff --git a/FEXCore/Source/Interface/Core/JIT/JITClass.h b/FEXCore/Source/Interface/Core/JIT/JITClass.h index 0c3dbbf8ff..f1f4477aa9 100644 --- a/FEXCore/Source/Interface/Core/JIT/JITClass.h +++ b/FEXCore/Source/Interface/Core/JIT/JITClass.h @@ -38,8 +38,9 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { ~Arm64JITCore() override; [[nodiscard]] - CPUBackend::CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override; + CPUBackend::CompiledCode + CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, + const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override; void ClearCache() override; diff --git a/FEXCore/include/FEXCore/Core/Context.h b/FEXCore/include/FEXCore/Core/Context.h index c6859a004d..e807107018 100644 --- a/FEXCore/include/FEXCore/Core/Context.h +++ b/FEXCore/include/FEXCore/Core/Context.h @@ -104,6 +104,9 @@ class Context { FEX_DEFAULT_VISIBILITY virtual void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) = 0; + FEX_DEFAULT_VISIBILITY virtual bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) = 0; + FEX_DEFAULT_VISIBILITY virtual bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) = 0; + ///< State reconstruction helpers ///< Reconstructs the guest RIP from the passed in thread context and related Host PC. 
FEX_DEFAULT_VISIBILITY virtual uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) = 0; diff --git a/Source/Windows/ARM64EC/Module.S b/Source/Windows/ARM64EC/Module.S index 46c18d884d..dcb01943cc 100644 --- a/Source/Windows/ARM64EC/Module.S +++ b/Source/Windows/ARM64EC/Module.S @@ -49,7 +49,8 @@ BeginSimulation: bl "#SyncThreadContext" ldr x17, [x18, #0x1788] // TEB->ChpeV2CpuAreaInfo ldr x16, [x17, #0x48] // ChpeV2CpuAreaInfo->EmulatorData[3] - DispatcherLoopTopEnterECFillSRA - br x16 // DispatcherLoopTopEnterECFillSRA(CPUArea:x17) + mov x10, #0 // Zero ENTRY_FILL_SRA_SINGLE_INST_REG to avoid single step + br x16 // DispatcherLoopTopEnterECFillSRA(SingleInst:x10, CPUArea:x17) // Called into by FEXCore // Expects the target code address in x9 diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index 02ae43046c..a24963412a 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -467,6 +467,8 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al } uint64_t HandleSyscall(FEXCore::Core::CpuStateFrame* Frame, FEXCore::HLE::SyscallArguments* Args) override { + ProcessPendingCrossProcessEmulatorWork(); + // Manually raise an exeption with the current JIT state packed into a native context, ntdll handles this and // reenters the JIT (see dlls/ntdll/signal_arm64ec.c in wine). uint64_t FPCR, FPSR; @@ -505,6 +507,7 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al } // namespace Exception extern "C" void SyncThreadContext(CONTEXT* Context) { + ProcessPendingCrossProcessEmulatorWork(); auto* Thread = GetCPUArea().ThreadState(); // All other EFlags bits are lost when converting to/from an ARM64EC context, so merge them in from the current JIT state. // This is advisable over dropping their values as thread suspend/resume uses this function, and that can happen at any point in guest code. 
@@ -591,7 +594,19 @@ bool ResetToConsistentStateImpl(EXCEPTION_RECORD* Exception, CONTEXT* GuestConte std::scoped_lock Lock(ThreadCreationMutex); if (InvalidationTracker->HandleRWXAccessViolation(FaultAddress)) { - LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress); + if (CTX->IsAddressInCodeBuffer(CPUArea.ThreadState(), NativeContext->Pc) && !CTX->IsCurrentBlockSingleInst(CPUArea.ThreadState()) && + CTX->IsAddressInCurrentBlock(CPUArea.ThreadState(), FaultAddress, 8)) { + // If the current block is not a single-instruction block (an instruction patching itself) and the write targets the current block, this is inline SMC. Reconstruct the current context (before the SMC write), then single step the write to reduce it to regular SMC. + Exception::ReconstructThreadState(CPUArea.ThreadState(), *NativeContext); + LogMan::Msg::DFmt("Handled inline self-modifying code: pc: {:X} rip: {:X} fault: {:X}", NativeContext->Pc, + CPUArea.ThreadState()->CurrentFrame->State.rip, FaultAddress); + NativeContext->Pc = CPUArea.DispatcherLoopTopEnterECFillSRA(); + NativeContext->Sp = CPUArea.EmulatorStackBase(); + NativeContext->X10 = 1; // Set ENTRY_FILL_SRA_SINGLE_INST_REG to force a single step + } else { + LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress); + } + return true; } }