diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 3e101e95dc..e82dfe96ba 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -716,7 +716,7 @@ class OpDispatchBuilder final : public IREmitter { void VZEROOp(OpcodeArgs); // X87 Ops - OrderedNode* ReconstructFSW(); + OrderedNode* ReconstructFSW(OrderedNode* T = nullptr); // Returns new x87 stack top from FSW. OrderedNode* ReconstructX87StateFromFSW(OrderedNode* FSW); template diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 5f346032bd..60bbf8e85d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -89,24 +89,6 @@ void OpDispatchBuilder::SetX87Top(OrderedNode* Value) { _StoreContext(1, GPRClass, Value, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); } -OrderedNode* OpDispatchBuilder::ReconstructFSW() { - // We must construct the FSW from our various bits - OrderedNode* FSW = _Constant(0); - auto Top = GetX87Top(); - FSW = _Bfi(OpSize::i64Bit, 3, 11, FSW, Top); - - auto C0 = GetRFLAG(FEXCore::X86State::X87FLAG_C0_LOC); - auto C1 = GetRFLAG(FEXCore::X86State::X87FLAG_C1_LOC); - auto C2 = GetRFLAG(FEXCore::X86State::X87FLAG_C2_LOC); - auto C3 = GetRFLAG(FEXCore::X86State::X87FLAG_C3_LOC); - - FSW = _Orlshl(OpSize::i64Bit, FSW, C0, 8); - FSW = _Orlshl(OpSize::i64Bit, FSW, C1, 9); - FSW = _Orlshl(OpSize::i64Bit, FSW, C2, 10); - FSW = _Orlshl(OpSize::i64Bit, FSW, C3, 14); - return FSW; -} - OrderedNode* OpDispatchBuilder::ReconstructX87StateFromFSW(OrderedNode* FSW) { auto Top = _Bfe(OpSize::i32Bit, 3, 11, FSW); SetX87Top(Top); @@ -343,10 +325,6 @@ void OpDispatchBuilder::X87LDSW(OpcodeArgs) { ReconstructX87StateFromFSW(NewFSW); } -void OpDispatchBuilder::X87FNSTSW(OpcodeArgs) { - 
StoreResult(GPRClass, Op, ReconstructFSW(), -1); -} - void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) { // 14 bytes for 16bit // 2 Bytes : FCW diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87New.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87New.cpp index 1cdf820a39..a8dd7d3d94 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87New.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87New.cpp @@ -541,13 +541,10 @@ void OpDispatchBuilder::F80SCALE(OpcodeArgs) { template void OpDispatchBuilder::X87ModifySTP(OpcodeArgs) { CurrentHeader->HasX87 = true; - auto orig_top = GetX87Top(); if (Inc) { - auto top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, orig_top, _Constant(1)), _Constant(7)); - SetX87Top(top); + _IncStackTop(); } else { - auto top = _And(OpSize::i32Bit, _Sub(OpSize::i32Bit, orig_top, _Constant(1)), _Constant(7)); - SetX87Top(top); + _DecStackTop(); } } @@ -582,4 +579,39 @@ void OpDispatchBuilder::F80F2XM1(OpcodeArgs) { _F80F2XM1Stack(); } +// Operations dealing with loading and storing environment pieces + + +// Reconstruct as a constant the Status Word of the FPU. +// We only track stack top and each of the code conditions (C flags) +// Top is 3 bits at bit 11. +// C0 is 1 bit at bit 8. +// C1 is 1 bit at bit 9. +// C2 is 1 bit at bit 10. +// C3 is 1 bit at bit 14. +// Optionally we can pass a pre calculated value for Top, otherwise we calculate it +// during the function runtime. +OrderedNode* OpDispatchBuilder::ReconstructFSW(OrderedNode* T) { + // We must construct the FSW from our various bits + OrderedNode* FSW = _Constant(0); + auto* Top = T ? 
T : GetX87Top(); + FSW = _Bfi(OpSize::i64Bit, 3, 11, FSW, Top); + + auto C0 = GetRFLAG(FEXCore::X86State::X87FLAG_C0_LOC); + auto C1 = GetRFLAG(FEXCore::X86State::X87FLAG_C1_LOC); + auto C2 = GetRFLAG(FEXCore::X86State::X87FLAG_C2_LOC); + auto C3 = GetRFLAG(FEXCore::X86State::X87FLAG_C3_LOC); + + FSW = _Orlshl(OpSize::i64Bit, FSW, C0, 8); + FSW = _Orlshl(OpSize::i64Bit, FSW, C1, 9); + FSW = _Orlshl(OpSize::i64Bit, FSW, C2, 10); + FSW = _Orlshl(OpSize::i64Bit, FSW, C3, 14); + return FSW; +} + +void OpDispatchBuilder::X87FNSTSW(OpcodeArgs) { + CurrentHeader->HasX87 = true; + StoreResult(GPRClass, Op, ReconstructFSW(_SyncStack()), -1); +} + } // namespace FEXCore::IR \ No newline at end of file diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index ea1aea37fa..3035df1898 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -2368,6 +2368,29 @@ } }, "F80": { + "GPR = SyncStack": { + "Desc": [ + "Synchronizes the stack environment with the MMX registers.", + "Returns the current stack top." + ], + "DestSize": "8", + "JITDispatch": false, + "HasSideEffects": true + }, + "IncStackTop": { + "Desc": [ + "Increase stack top-pointer." + ], + "JITDispatch": false, + "HasSideEffects": true + }, + "DecStackTop": { + "Desc": [ + "Decrease stack top-pointer." 
+ ], + "JITDispatch": false, + "HasSideEffects": true + }, "FPR = PushStack FPR:$X80Src, OpSize:$OpSize, i1:$Float, u8:$LoadSize": { "Desc": [ "Pushes the provided source on to the x87 stack.", @@ -2415,51 +2438,46 @@ "DestSize": "16", "JITDispatch": false }, - "FPR = F80AddStack u8:$SrcStack1, u8:$SrcStack2": { + "F80AddStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80AddValue u8:$SrcStack, FPR:$X80Src": { + "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "16", "JITDispatch": false }, - "FPR = F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { + "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Subtracts the value in stack location TOP+$SrcStack2 from the value in stack location TOP+$SrcStack1.", - "The result is stored in stack location TOP+$DstStack and returned." + "The result is stored in stack location TOP+$DstStack." ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80SubValue u8:$SrcStack, FPR:$X80Src": { + "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Subtracts the value $X80Src from the value in stack location TOP+$SrcStack.", - "The result is stored in stack location TOP and returned." + "The result is stored in stack location TOP." ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80SubRValue FPR:$X80Src, u8:$SrcStack": { + "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ "Subtracts the value in stack location TOP+$SrcStack from the value $X80Src.", - "The result is stored in stack location TOP and returned." + "The result is stored in stack location TOP." 
], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { @@ -2471,54 +2489,49 @@ "DestSize": "16", "JITDispatch": false }, - "FPR = F80MulStack u8:$SrcStack1, u8:$SrcStack2": { + "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80MulValue u8:$SrcStack, FPR:$X80Src": { + "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "16", "JITDispatch": false }, - "FPR = F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { + "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Divides the value in stack location TOP+$SrcStack1 by the value in stack location TOP+$SrcStack2.", - "The result is stored in stack location TOP+$DstStack and returned.", + "The result is stored in stack location TOP+$DstStack.", "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80DivValue u8:$SrcStack, FPR:$X80Src": { + "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Divides the value in stack location TOP+$SrcStack by the value $X80Src.", "The result is stored in stack location TOP and returned.", "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, - "FPR = F80DivRValue FPR:$X80Src, u8:$SrcStack": { + "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ "Divides the value X80Src by the value in stack location TOP+$SrcStack.", - "The result is stored in stack location TOP and returned.", + "The result is stored in stack location TOP.", 
"`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, - "DestSize": "16", "JITDispatch": false }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 345301093d..abea1b54dc 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -4,7 +4,7 @@ #include "Interface/IR/PassManager.h" #include #include -#include +#include #include #include @@ -20,55 +20,68 @@ namespace FEXCore::IR { template class FixedSizeStack { private: - fextl::vector buffer; + fextl::deque> buffer; public: FixedSizeStack() { - buffer.reserve(MaxSize); + buffer.resize(MaxSize, {false, T()}); } - void push(T value) { - if (buffer.size() == MaxSize) { + void push(const T& value) { + rotate(); + buffer.front() = {true, value}; + } + + // Rotate the elements with the direction controlled by Right + void rotate(bool Right = true) { + if (Right) { + // Right rotation + std::pair temp = std::move(buffer.back()); buffer.pop_back(); + buffer.push_front(std::move(temp)); + } else { + // Left rotation + std::pair temp = std::move(buffer.front()); + buffer.pop_front(); + buffer.push_back(std::move(temp)); } - buffer.emplace(buffer.begin(), std::move(value)); } void pop() { - if (!buffer.empty()) { - buffer.erase(buffer.begin()); - } + buffer.front() = {false, T()}; + rotate(false); } - std::optional top(size_t offset = 0) const { - if (!buffer.empty()) { - return buffer[offset]; - } - return std::nullopt; + const std::pair& top(size_t offset = 0) const { + return buffer[offset]; } - void setTop(T value, size_t offset = 0) { - if (!buffer.empty()) { - buffer[offset] = std::move(value); - return; - } - LOGMAN_THROW_A_FMT(offset == 0, "offset needs to be zero when setting empty stack"); - push(std::move(value)); + void setTop(T Value, size_t Offset = 0) { + 
buffer[Offset] = {true, Value}; } - inline size_t size() const { - return buffer.size(); + bool isValid(size_t Offset) const { + return buffer[Offset].first; } inline void clear() { - buffer.clear(); + for (auto& element : buffer) { + element = {false, T()}; // Set all elements as invalid + } } - void dump() { - for (size_t i = 0; i < buffer.size(); ++i) { - LogMan::Msg::DFmt("ST{}: 0x{:x}", i, (uintptr_t)(buffer[i].StackDataNode)); + void dump() const { + for (size_t i = 0; i < MaxSize; i++) { + const auto& [Valid, Element] = buffer[i]; + if (Valid) { + LogMan::Msg::DFmt("ST{}: 0x{:x}", i, (uintptr_t)(Element.StackDataNode)); + } } } + + constexpr size_t size() const { + return MaxSize; + } }; class X87StackOptimization final : public FEXCore::IR::Pass { @@ -99,6 +112,15 @@ class X87StackOptimization final : public FEXCore::IR::Pass { // Update Top value in slow path for a pop void UpdateTop4Pop_Slow(IREmitter* IREmit); void UpdateTop4Push_Slow(IREmitter* IREmit); + // Synchronizes the current simulated stack with the actual values. + // Returns a new value for Top, that's synchronized between the simulated stack + // and the actual FPU stack. + OrderedNode* SynchronizeStackValues(IREmitter* IREmit); + // Moves us from the fast to the slow path if ShouldMigrate is true. + void MigrateToSlowPathIf(IREmitter* IREmit, bool ShouldMigrate); + // Top Cache Management + OrderedNode* GetTopWithCache_Slow(IREmitter* IREmit); + void SetTopWithCache_Slow(IREmitter* IREmit, OrderedNode* Value); struct StackMemberInfo { IR::OpSize SourceDataSize; // Size of SourceDataNode @@ -109,15 +131,64 @@ class X87StackOptimization final : public FEXCore::IR::Pass { IR::OrderedNode* StackDataNode; // Reference to the data in the Stack. bool InterpretAsFloat {}; // True if this is a floating point value, false if integer }; + + // StackData, TopOffset and TopCache need to be always properly set to ensure + // it reflects the current state of the FPU. 
This sync only makes sense while + // taking the fast path. Once in the slow path, these don't make sense anymore + // and we are syncing everything. + // Index on vector is offset to top value at start of block + // If slow path is true, then StackData is always empty. FixedSizeStack StackData; + // Real top as an offset from stored top value (or the one at the beginning of the block) + // For example, if we start and push a value to our simulated stack, because we don't + // update top straight away the TopOffset is 1. + // If SlowPath is true, then TopOffset is always zero. + // int8_t TopOffset = 0; // TODO: MIGHT BE UNNECESSARY!!! + // Cached value for Top + // If slowpath is false, then TopCache is nullptr. + OrderedNode* TopCache = nullptr; + // Are we on the slow path? + // Once we enter the slow path, we never come out. + // This just simplifies the code atm. If there's a need to return to the fast path in the future + // we can implement that but I would expect that there would be very few cases where that's necessary. + // On the slow path TopCache is always the last obtained version of top. 
+ // TopOffset is ignored + bool SlowPath = false; }; -OrderedNode* X87StackOptimization::GetX87Top(IREmitter* IREmit) { +void X87StackOptimization::MigrateToSlowPathIf(IREmitter* IREmit, bool ShouldMigrate) { + if (SlowPath) { + return; + } + if (!ShouldMigrate) { + return; + } + + SynchronizeStackValues(IREmit); + SlowPath = true; + StackData.clear(); +} + +OrderedNode* X87StackOptimization::GetTopWithCache_Slow(IREmitter* IREmit) { + if (TopCache) { + return TopCache; + } + + TopCache = GetX87Top(IREmit); + return TopCache; +} + +OrderedNode* X87StackOptimization::GetX87Top(IREmitter* IREmit) { // FIXME: Shouldnt use directly - use GetTopWithCache return IREmit->_LoadContext(1, GPRClass, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); } -void X87StackOptimization::SetX87Top(IREmitter* IREmit, OrderedNode* Value) { +void X87StackOptimization::SetTopWithCache_Slow(IREmitter* IREmit, OrderedNode* Value) { + SetX87Top(IREmit, Value); + TopCache = Value; +} + +void X87StackOptimization::SetX87Top(IREmitter* IREmit, OrderedNode* Value) { // FIXME: Shouldnt use directly - use SetTopWithCache IREmit->_StoreContext(1, GPRClass, Value, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); } @@ -138,7 +209,8 @@ OrderedNode* X87StackOptimization::LoadStackValueAtTop_Slow(IREmitter* IREmit) { // to find a way to manage this. 
OrderedNode* X87StackOptimization::LoadStackValueAtOffset_Slow(IREmitter* IREmit, uint8_t Offset) { // Load the current value from the x87 fpu stack - return IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, GetX87Top(IREmit), IREmit->_Constant(Offset)), 16, MMBaseOffset(), 16, FPRClass); + return IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, GetTopWithCache_Slow(IREmit), IREmit->_Constant(Offset)), 16, + MMBaseOffset(), 16, FPRClass); } void X87StackOptimization::StoreStackValueAtTop_Slow(IREmitter* IREmit, OrderedNode* Value) { @@ -146,7 +218,15 @@ void X87StackOptimization::StoreStackValueAtTop_Slow(IREmitter* IREmit, OrderedN } void X87StackOptimization::StoreStackValueAtOffset_Slow(IREmitter* IREmit, uint8_t Offset, OrderedNode* Value) { - IREmit->_StoreContextIndexed(Value, IREmit->_Add(OpSize::i32Bit, GetX87Top(IREmit), IREmit->_Constant(Offset)), 16, MMBaseOffset(), 16, FPRClass); + OrderedNode* IndexValue = GetTopWithCache_Slow(IREmit); + if (Offset != 0) { + IndexValue = IREmit->_Add(OpSize::i32Bit, IndexValue, IREmit->_Constant(Offset)); + } + + // mark it valid + SetX87ValidTag(IREmit, IndexValue, true); + // store + IREmit->_StoreContextIndexed(Value, IndexValue, 16, MMBaseOffset(), 16, FPRClass); } void X87StackOptimization::UpdateTop4Pop_Slow(IREmitter* IREmit) { @@ -161,6 +241,55 @@ void X87StackOptimization::UpdateTop4Push_Slow(IREmitter* IREmit) { SetX87Top(IREmit, IREmit->_Sub(OpSize::i32Bit, top, IREmit->_Constant(1))); } +// We synchronize stack values in a few occasions but one of the most important of those, +// is when we move from fast to a slow path and need to make sure that the context is properly +// written. +OrderedNode* X87StackOptimization::SynchronizeStackValues(IREmitter* IREmit) { + if (SlowPath) { // Nothing to do here.
+ return GetTopWithCache_Slow(IREmit); + } + + OrderedNode* NewTop = nullptr; + auto CurrentIR = IREmit->ViewIR(); + + LogMan::Msg::DFmt("Writing stack to context\n"); + StackData.dump(); + for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { + for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { + if (IR::IsBlockExit(IROp->Op)) { + LogMan::Msg::DFmt("Found a block exit!\n"); + // Set write cursor to previous instruction + IREmit->SetWriteCursor(IREmit->UnwrapNode(CodeNode->Header.Previous)); + + // Store new top which is now the original top - the number of elements in stack. + // Careful with underflow wraparound. + auto* orig_top = GetTopWithCache_Slow(IREmit); + + auto mask = IREmit->_Constant(0x7); + NewTop = IREmit->_And(OpSize::i32Bit, IREmit->_Sub(OpSize::i32Bit, orig_top, IREmit->_Constant(StackData.size())), mask); + SetX87ValidTag(IREmit, NewTop, true); // FIXME(pmatos): this is wrong - we need to set tag for all values we are setting + SetTopWithCache_Slow(IREmit, NewTop); + + // Before leaving we need to write the current values in the stack to + // context so that the values are correct. Copy SourceDataNode in the + // stack to the respective mmX register. + for (size_t i = 0; i < StackData.size(); ++i) { + const auto& [Valid, StackMember] = StackData.top(i); + if (!Valid) { + continue; + } + IREmit->_StoreContextIndexed(StackMember.StackDataNode, IREmit->_Add(OpSize::i32Bit, NewTop, IREmit->_Constant(i)), 16, + MMBaseOffset(), 16, FPRClass); + } + + break; + } + } + } + + return NewTop ? 
NewTop : GetTopWithCache_Slow(IREmit); +} + bool X87StackOptimization::Run(IREmitter* IREmit) { FEXCORE_PROFILE_SCOPED("PassManager::x87StackOpt"); @@ -188,55 +317,64 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { LogMan::Msg::DFmt("OP_PUSHSTACK\n"); const auto* Op = IROp->C(); auto* SourceNode = CurrentIR.GetNode(Op->X80Src); - auto* SourceNodeOp = CurrentIR.GetOp(SourceNode); - auto SourceNodeSize = SourceNodeOp->Size; - StackData.push(StackMemberInfo { - .SourceDataSize = IR::SizeToOpSize(SourceNodeSize), - .StackDataSize = IR::SizeToOpSize(Op->LoadSize), - .SourceDataNode = nullptr, - .StackDataNode = SourceNode, - .InterpretAsFloat = Op->Float, - }); + if (SlowPath) { + UpdateTop4Push_Slow(IREmit); + StoreStackValueAtTop_Slow(IREmit, SourceNode); + } else { + auto* SourceNode = CurrentIR.GetNode(Op->X80Src); + auto* SourceNodeOp = CurrentIR.GetOp(SourceNode); + auto SourceNodeSize = SourceNodeOp->Size; + StackData.push(StackMemberInfo { + .SourceDataSize = IR::SizeToOpSize(SourceNodeSize), + .StackDataSize = IR::SizeToOpSize(Op->LoadSize), + .SourceDataNode = nullptr, + .StackDataNode = SourceNode, + .InterpretAsFloat = Op->Float, + }); + } LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); IREmit->Remove(CodeNode); // Remove PushStack - it's a nop, we just need to track the stack - Changed = true; + break; } case IR::OP_READSTACKVALUE: { LogMan::Msg::DFmt("OP_READSTACKVALUE\n"); const auto* Op = IROp->C(); - auto offset = Op->StackLocation; + auto Offset = Op->StackLocation; + const auto& [Valid, Value] = StackData.top(Offset); + MigrateToSlowPathIf(IREmit, !Valid); OrderedNode* NewValue = nullptr; - if (offset >= StackData.size()) { + if (SlowPath) { // slow path - NewValue = LoadStackValueAtOffset_Slow(IREmit, offset); + NewValue = LoadStackValueAtOffset_Slow(IREmit, Offset); } else { // fast path - NewValue = StackData.top(offset)->StackDataNode; + NewValue = StackData.top(Offset).second.StackDataNode; } auto CodeIter = 
CurrentIR.at(CodeNode); IREmit->ReplaceUsesWithAfter(CodeNode, NewValue, CodeIter); IREmit->Remove(CodeNode); - Changed = true; + break; } - case IR::OP_STORESTACKMEMORY: { + case IR::OP_STORESTACKMEMORY: { // stores top of stack in mem addr. LogMan::Msg::DFmt("OP_STORESTACKMEMORY\n"); const auto* Op = IROp->C(); + const auto& [Valid, Value] = StackData.top(); + MigrateToSlowPathIf(IREmit, !Valid); + OrderedNode* StackNode = nullptr; - if (StackData.size() == 0) { // slow path + if (SlowPath) { // slow path LogMan::Msg::DFmt("Slow path STORESTACKMEMORY\n"); - auto* top = GetX87Top(IREmit); - StackNode = IREmit->_LoadContextIndexed(top, 16, MMBaseOffset(), 16, FPRClass); + StackNode = LoadStackValueAtTop_Slow(IREmit); } else { // fast path LogMan::Msg::DFmt("Fast path STORESTACKMEMORY\n"); - StackNode = StackData.top()->StackDataNode; + StackNode = StackData.top().second.StackDataNode; } if (Op->StoreSize != 10) { // if it's not 80bits then convert @@ -245,44 +383,36 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto* AddrNode = CurrentIR.GetNode(Op->Addr); IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode); IREmit->Remove(CodeNode); - - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); break; } - case IR::OP_STORESTACKTOSTACK: { + case IR::OP_STORESTACKTOSTACK: { // stores top of stack in another place in stack. LogMan::Msg::DFmt("OP_STORESTACKTOSTACK\n"); const auto* Op = IROp->C(); - auto offset = Op->StackLocation; + auto Offset = Op->StackLocation; - if (offset == 0) { // nop - IREmit->Remove(CodeNode); - break; - } + if (Offset != 0) { + const auto& [Valid, Value] = StackData.top(); + MigrateToSlowPathIf(IREmit, !Valid); - // Need to store st0 to stack location - basically a copy. 
- if (offset >= StackData.size()) { // slow path - LogMan::Msg::DFmt("Slow path STORESTACKTOSTACK\n"); - auto* top = GetX87Top(IREmit); - OrderedNode* StackNode = IREmit->_LoadContextIndexed(top, 16, MMBaseOffset(), 16, FPRClass); - IREmit->_StoreContextIndexed(StackNode, IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(offset)), 16, MMBaseOffset(), 16, FPRClass); - } else { // fast path - LogMan::Msg::DFmt("Fast path STORESTACKTOSTACK\n"); - StackData.setTop(*StackData.top(), offset); + // Need to store st0 to stack location - basically a copy. + if (SlowPath) { // slow path + LogMan::Msg::DFmt("Slow path STORESTACKTOSTACK\n"); + StoreStackValueAtOffset_Slow(IREmit, Offset, LoadStackValueAtTop_Slow(IREmit)); + } else { // fast path + LogMan::Msg::DFmt("Fast path STORESTACKTOSTACK\n"); + StackData.setTop(Value, Offset); + } } IREmit->Remove(CodeNode); - - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); break; } case IR::OP_POPSTACKDESTROY: { LogMan::Msg::DFmt("OP_POPSTACKDESTROY\n"); - if (StackData.size() == 0) { // slow path + if (SlowPath) { // slow path LogMan::Msg::DFmt("Slow path POPSTACKDESTROY\n"); UpdateTop4Pop_Slow(IREmit); } else { @@ -291,9 +421,6 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { } IREmit->Remove(CodeNode); - - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); break; } case IR::OP_F80ADDSTACK: { @@ -304,70 +431,64 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto StackOffset1 = Op->SrcStack1; auto StackOffset2 = Op->SrcStack2; - auto StackMember1 = StackData.top(StackOffset1); - auto StackMember2 = StackData.top(StackOffset2); + const auto& [Valid1, StackMember1] = StackData.top(StackOffset1); + const auto& [Valid2, StackMember2] = StackData.top(StackOffset2); - if (!StackMember1 || !StackMember2) { // Slow Path + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + if (SlowPath) { // Slow Path LogMan::Msg::DFmt("Slow path F80ADDSTACK\n"); - auto* top = 
GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode1 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset1)), 16, MMBaseOffset(), 16, FPRClass); - auto StackNode2 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset2)), 16, MMBaseOffset(), 16, FPRClass); + auto* StackNode1 = LoadStackValueAtOffset_Slow(IREmit, StackOffset1); + auto* StackNode2 = LoadStackValueAtOffset_Slow(IREmit, StackOffset2); auto AddNode = IREmit->_F80Add(StackNode1, StackNode2); - IREmit->_StoreContextIndexed(AddNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, AddNode); } else { // Fast path LogMan::Msg::DFmt("Fast path F80ADDSTACK\n"); - auto AddNode = IREmit->_F80Add(StackMember1->StackDataNode, StackMember2->StackDataNode); + auto AddNode = IREmit->_F80Add(StackMember1.StackDataNode, StackMember2.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = AddNode, - .InterpretAsFloat = StackMember1->InterpretAsFloat}, + .InterpretAsFloat = StackMember1.InterpretAsFloat}, StackOffset1); } IREmit->Remove(CodeNode); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); break; } case IR::OP_F80ADDVALUE: { LogMan::Msg::DFmt("F80ADDVALUE\n"); const auto* Op = IROp->C(); auto* ValueNode = CurrentIR.GetNode(Op->X80Src); - auto StackOffset = Op->SrcStack; - const auto& StackMember = StackData.top(StackOffset); - if (StackMember == std::nullopt) { // slow path + const auto& [Valid, StackMember] = StackData.top(StackOffset); + MigrateToSlowPathIf(IREmit, !Valid); + + if (SlowPath) { // slow path LogMan::Msg::DFmt("Slow path F80ADDVALUE\n"); -
auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode = LoadStackValueAtOffset_Slow(IREmit, StackOffset); auto AddNode = IREmit->_F80Add(ValueNode, StackNode); // Store it in stack TOP LogMan::Msg::DFmt("Storing node to TOP of stack\n"); - IREmit->_Print(top); - IREmit->_StoreContextIndexed(AddNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, AddNode); } else { LogMan::Msg::DFmt("Fast path F80ADDVALUE\n"); - auto AddNode = IREmit->_F80Add(ValueNode, StackMember->StackDataNode); + auto AddNode = IREmit->_F80Add(ValueNode, StackMember.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = AddNode, - .InterpretAsFloat = StackMember->InterpretAsFloat}); + .InterpretAsFloat = StackMember.InterpretAsFloat}); LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); StackData.dump(); } @@ -385,32 +506,30 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto StackOffset1 = Op->SrcStack1; auto StackOffset2 = Op->SrcStack2; - auto StackMember1 = StackData.top(StackOffset1); - auto StackMember2 = StackData.top(StackOffset2); + const auto& [Valid1, StackMember1] = StackData.top(StackOffset1); + const auto& [Valid2, StackMember2] = StackData.top(StackOffset2); - if (!StackMember1 || !StackMember2) { // Slow Path + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + if (SlowPath) { // Slow Path LogMan::Msg::DFmt("Slow path F80SUBSTACK\n"); - auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode1 = - 
IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset1)), 16, MMBaseOffset(), 16, FPRClass); - auto StackNode2 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset2)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode1 = LoadStackValueAtOffset_Slow(IREmit, StackOffset1); + auto StackNode2 = LoadStackValueAtOffset_Slow(IREmit, StackOffset2); auto SubNode = IREmit->_F80Sub(StackNode1, StackNode2); - IREmit->_StoreContextIndexed(SubNode, IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackDest)), 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtOffset_Slow(IREmit, StackDest, SubNode); } else { // Fast path LogMan::Msg::DFmt("Fast path F80SUBSTACK\n"); - auto SubNode = IREmit->_F80Sub(StackMember1->StackDataNode, StackMember2->StackDataNode); + auto SubNode = IREmit->_F80Sub(StackMember1.StackDataNode, StackMember2.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = SubNode, - .InterpretAsFloat = StackMember1->InterpretAsFloat}, + .InterpretAsFloat = StackMember1.InterpretAsFloat}, StackDest); } @@ -427,15 +546,15 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto* ValueNode = CurrentIR.GetNode(Op->X80Src); auto StackOffset = Op->SrcStack; - const auto& StackMember = StackData.top(StackOffset); + const auto& [Valid, StackMember] = StackData.top(StackOffset); - if (StackMember == std::nullopt) { // slow path + MigrateToSlowPathIf(IREmit, !Valid); + + if (SlowPath) { // slow path LogMan::Msg::DFmt("Slow path F80SUBVALUE\n"); - auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top,
IREmit->_Constant(StackOffset)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode = LoadStackValueAtOffset_Slow(IREmit, StackOffset); OrderedNode* SubNode = nullptr; if (IROp->Op == IR::OP_F80SUBVALUE) { @@ -445,25 +564,21 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { } // Store it in stack TOP - LogMan::Msg::DFmt("Storing node to TOP of stack\n"); - IREmit->_Print(top); - IREmit->_StoreContextIndexed(SubNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, SubNode); } else { LogMan::Msg::DFmt("Fast path F80SUBVALUE\n"); OrderedNode* SubNode = nullptr; if (IROp->Op == IR::OP_F80SUBVALUE) { - SubNode = IREmit->_F80Sub(StackMember->StackDataNode, ValueNode); + SubNode = IREmit->_F80Sub(StackMember.StackDataNode, ValueNode); } else { - SubNode = IREmit->_F80Sub(ValueNode, StackMember->StackDataNode); // IR::OP_F80SUBRVALUE + SubNode = IREmit->_F80Sub(ValueNode, StackMember.StackDataNode); // IR::OP_F80SUBRVALUE } // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = SubNode, - .InterpretAsFloat = StackMember->InterpretAsFloat}); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); Changed = true; @@ -479,38 +594,36 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto StackOffset1 = Op->SrcStack1; auto StackOffset2 = Op->SrcStack2; - auto StackMember1 = StackData.top(StackOffset1); - auto StackMember2 = StackData.top(StackOffset2); + const auto& [Valid1, StackMember1] = StackData.top(StackOffset1); + const auto& [Valid2, StackMember2] = StackData.top(StackOffset2); - if (!StackMember1 || !StackMember2) { // Slow Path + MigrateToSlowPathIf(IREmit, 
!Valid1 || !Valid2); + + if (SlowPath) { // Slow Path LogMan::Msg::DFmt("Slow path F80DIVSTACK\n"); auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode1 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset1)), 16, MMBaseOffset(), 16, FPRClass); - auto StackNode2 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset2)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode1 = LoadStackValueAtOffset_Slow(IREmit, StackOffset1); + auto StackNode2 = LoadStackValueAtOffset_Slow(IREmit, StackOffset2); auto DivNode = IREmit->_F80Div(StackNode1, StackNode2); - IREmit->_StoreContextIndexed(DivNode, IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackDest)), 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, DivNode); } else { // Fast path LogMan::Msg::DFmt("Fast path F80DIVSTACK\n"); - auto AddNode = IREmit->_F80Div(StackMember1->StackDataNode, StackMember2->StackDataNode); + auto DivNode = IREmit->_F80Div(StackMember1.StackDataNode, StackMember2.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, - .StackDataNode = AddNode, - .InterpretAsFloat = StackMember1->InterpretAsFloat}, + .StackDataNode = DivNode, + .InterpretAsFloat = StackMember1.InterpretAsFloat}, StackDest); } IREmit->Remove(CodeNode); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); break; } @@ -521,15 +634,16 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto* ValueNode = CurrentIR.GetNode(Op->X80Src); auto StackOffset = Op->SrcStack; - const auto& StackMember = StackData.top(StackOffset); + const auto& [Valid, StackMember] = 
StackData.top(StackOffset); - if (StackMember == std::nullopt) { // slow path + MigrateToSlowPathIf(IREmit, !Valid); + + if (SlowPath) { // slow path LogMan::Msg::DFmt("Slow path F80DIVVALUE\n"); auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode = LoadStackValueAtOffset_Slow(IREmit, StackOffset); OrderedNode* DivNode = nullptr; if (IROp->Op == IR::OP_F80DIVVALUE) { @@ -539,28 +653,23 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { } // Store it in stack TOP - LogMan::Msg::DFmt("Storing node to TOP of stack\n"); - IREmit->_Print(top); - IREmit->_StoreContextIndexed(DivNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, DivNode); } else { LogMan::Msg::DFmt("Fast path F80DIVVALUE\n"); OrderedNode* DivNode = nullptr; if (IROp->Op == IR::OP_F80DIVVALUE) { - DivNode = IREmit->_F80Div(StackMember->StackDataNode, ValueNode); + DivNode = IREmit->_F80Div(StackMember.StackDataNode, ValueNode); } else { - DivNode = IREmit->_F80Div(ValueNode, StackMember->StackDataNode); // IR::OP_F80SUBRVALUE + DivNode = IREmit->_F80Div(ValueNode, StackMember.StackDataNode); // IR::OP_F80SUBRVALUE } // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = DivNode, - .InterpretAsFloat = StackMember->InterpretAsFloat}); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } @@ -572,37 +681,35 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto StackOffset1 = 
Op->SrcStack1; auto StackOffset2 = Op->SrcStack2; - auto StackMember1 = StackData.top(StackOffset1); - auto StackMember2 = StackData.top(StackOffset2); + const auto& [Valid1, StackMember1] = StackData.top(StackOffset1); + const auto& [Valid2, StackMember2] = StackData.top(StackOffset2); + + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); - if (!StackMember1 || !StackMember2) { // Slow Path + if (SlowPath) { // Slow Path LogMan::Msg::DFmt("Slow path F80MULSTACK\n"); auto* top = GetX87Top(IREmit); // Load the current value from the x87 fpu stack - auto StackNode1 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset1)), 16, MMBaseOffset(), 16, FPRClass); - auto StackNode2 = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset2)), 16, MMBaseOffset(), 16, FPRClass); + auto StackNode1 = LoadStackValueAtOffset_Slow(IREmit, StackOffset1); + auto StackNode2 = LoadStackValueAtOffset_Slow(IREmit, StackOffset2); auto MulNode = IREmit->_F80Mul(StackNode1, StackNode2); - IREmit->_StoreContextIndexed(MulNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, MulNode); } else { // Fast Path LogMan::Msg::DFmt("Fast path F80MULSTACK\n"); - auto MulNode = IREmit->_F80Mul(StackMember1->StackDataNode, StackMember2->StackDataNode); + auto MulNode = IREmit->_F80Mul(StackMember1.StackDataNode, StackMember2.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = MulNode, - .InterpretAsFloat = StackMember1->InterpretAsFloat}, + .InterpretAsFloat = StackMember1.InterpretAsFloat}, StackOffset1); } IREmit->Remove(CodeNode); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - 
StackData.dump(); break; } @@ -612,32 +719,27 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto* ValueNode = CurrentIR.GetNode(Op->X80Src); auto StackOffset = Op->SrcStack; - const auto& StackMember = StackData.top(StackOffset); + const auto& [Valid, StackMember] = StackData.top(StackOffset); - if (StackMember == std::nullopt) { // slow path - LogMan::Msg::DFmt("Slow path F80MulVALUE\n"); + MigrateToSlowPathIf(IREmit, !Valid); - auto* top = GetX87Top(IREmit); + if (SlowPath) { // slow path + LogMan::Msg::DFmt("Slow path F80MulVALUE\n"); // Load the current value from the x87 fpu stack - auto StackNode = - IREmit->_LoadContextIndexed(IREmit->_Add(OpSize::i32Bit, top, IREmit->_Constant(StackOffset)), 16, MMBaseOffset(), 16, FPRClass); - auto AddNode = IREmit->_F80Mul(ValueNode, StackNode); + auto StackNode = LoadStackValueAtOffset_Slow(IREmit, StackOffset); + auto MulNode = IREmit->_F80Mul(ValueNode, StackNode); // Store it in stack TOP - LogMan::Msg::DFmt("Storing node to TOP of stack\n"); - IREmit->_Print(top); - IREmit->_StoreContextIndexed(AddNode, top, 16, MMBaseOffset(), 16, FPRClass); + StoreStackValueAtTop_Slow(IREmit, MulNode); } else { LogMan::Msg::DFmt("Fast path F80MULVALUE\n"); - auto MulNode = IREmit->_F80Mul(ValueNode, StackMember->StackDataNode); + auto MulNode = IREmit->_F80Mul(ValueNode, StackMember.StackDataNode); // Store it in the stack - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = MulNode, - .InterpretAsFloat = StackMember->InterpretAsFloat}); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - StackData.dump(); + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); Changed = true; @@ -648,25 +750,29 @@ bool X87StackOptimization::Run(IREmitter* 
IREmit) { LogMan::Msg::DFmt("F80Xchange\n"); const auto* Op = IROp->C(); auto offset = Op->SrcStack; - const auto& StackMember = StackData.top(offset); - if (StackMember == std::nullopt) { // slow path + + const auto& [ValidTop, StackTop] = StackData.top(); + const auto& [Valid, StackMember] = StackData.top(offset); + + MigrateToSlowPathIf(IREmit, !ValidTop || !Valid); + + if (SlowPath) { // slow path auto* ValueTop = LoadStackValueAtTop_Slow(IREmit); auto* ValueOffset = LoadStackValueAtOffset_Slow(IREmit, offset); StoreStackValueAtTop_Slow(IREmit, ValueOffset); StoreStackValueAtOffset_Slow(IREmit, offset, ValueTop); } else { // fast path - auto tmp = StackData.top(); - StackData.setTop(*StackMember); - StackData.setTop(*tmp, offset); + StackData.setTop(StackMember); + StackData.setTop(StackTop, offset); } IREmit->Remove(CodeNode); - Changed = true; + break; } case OP_F80STACKCHANGESIGN: { LogMan::Msg::DFmt("F80ChangeSign\n"); - const auto& StackMember = StackData.top(); + const auto& [Valid, StackMember] = StackData.top(); // We need a couple of intermediate instructions to change the sign // of a value @@ -675,26 +781,28 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { OrderedNode* data = IREmit->_VCastFromGPR(16, 8, low); data = IREmit->_VInsGPR(16, 8, 1, data, high); - if (StackMember == std::nullopt) { // slow path + MigrateToSlowPathIf(IREmit, Valid); + + if (SlowPath) { // slow path auto* value = LoadStackValueAtTop_Slow(IREmit); // Negate value auto result = IREmit->_VXor(16, 1, value, data); StoreStackValueAtTop_Slow(IREmit, result); } else { // fast path - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, - .StackDataNode = IREmit->_VXor(16, 1, StackMember->StackDataNode, data), - .InterpretAsFloat = 
StackMember->InterpretAsFloat}); + .StackDataNode = IREmit->_VXor(16, 1, StackMember.StackDataNode, data), + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } case IR::OP_F80STACKABS: { LogMan::Msg::DFmt("F80Abs"); - const auto& StackMember = StackData.top(); + const auto& [Valid, StackMember] = StackData.top(); + // Intermediate insts auto low = IREmit->_Constant(~0ULL); @@ -702,31 +810,34 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { OrderedNode* data = IREmit->_VCastFromGPR(16, 8, low); data = IREmit->_VInsGPR(16, 8, 1, data, high); - if (StackMember == std::nullopt) { + MigrateToSlowPathIf(IREmit, Valid); + + if (SlowPath) { // slow path auto* value = LoadStackValueAtTop_Slow(IREmit); auto result = IREmit->_VAnd(16, 1, value, data); StoreStackValueAtTop_Slow(IREmit, result); } else { // fast path - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, - .StackDataNode = IREmit->_VAnd(16, 1, StackMember->StackDataNode, data), - .InterpretAsFloat = StackMember->InterpretAsFloat}); + .StackDataNode = IREmit->_VAnd(16, 1, StackMember.StackDataNode, data), + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } case IR::OP_F80STACKFYL2X: { LogMan::Msg::DFmt("OP_F80STACKFYL2X"); - const auto& StackMember1 = StackData.top(); - const auto& StackMember2 = StackData.top(1); + const auto& [Valid1, StackMember1] = StackData.top(); + const auto& [Valid2, StackMember2] = StackData.top(1); - if (StackMember1 == std::nullopt || StackMember2 == std::nullopt) { + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + + if (SlowPath) { // slow path auto* st0 = LoadStackValueAtTop_Slow(IREmit); auto* st1 = LoadStackValueAtOffset_Slow(IREmit, 
1); @@ -737,15 +848,14 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { } else { // fast path StackData.pop(); // we need to write the result st1, so if popping and setTop has the same behaviour - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, - .StackDataNode = IREmit->_F80FYL2X(StackMember1->StackDataNode, StackMember2->StackDataNode), - .InterpretAsFloat = StackMember1->InterpretAsFloat}); + .StackDataNode = IREmit->_F80FYL2X(StackMember1.StackDataNode, StackMember2.StackDataNode), + .InterpretAsFloat = StackMember1.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } @@ -753,23 +863,25 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { LogMan::Msg::DFmt("OP_F80CMPSTACK"); const auto* Op = IROp->C(); auto offset = Op->SrcStack; - const auto& StackMember1 = StackData.top(); - const auto& StackMember2 = StackData.top(offset); + const auto& [Valid1, StackMember1] = StackData.top(); + const auto& [Valid2, StackMember2] = StackData.top(offset); + + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + OrderedNode* CmpNode = nullptr; - if (StackMember1 == std::nullopt || StackMember2 == std::nullopt) { + if (SlowPath) { // slow path auto* StackValue1 = LoadStackValueAtTop_Slow(IREmit); auto* StackValue2 = LoadStackValueAtOffset_Slow(IREmit, offset); CmpNode = IREmit->_F80Cmp(StackValue1, StackValue2, Op->Flags); } else { // fast path - CmpNode = IREmit->_F80Cmp(StackMember1->StackDataNode, StackMember2->StackDataNode, Op->Flags); + CmpNode = IREmit->_F80Cmp(StackMember1.StackDataNode, StackMember2.StackDataNode, Op->Flags); } IREmit->Remove(CodeNode); IREmit->ReplaceUsesWithAfter(CodeNode, CmpNode, CodeNode); - Changed = true; break; } @@ -777,30 +889,33 @@ bool X87StackOptimization::Run(IREmitter* 
IREmit) { LogMan::Msg::DFmt("OP_F80CMPVALUE"); const auto* Op = IROp->C(); const auto& Value = CurrentIR.GetNode(Op->X80Src); - const auto& StackMember = StackData.top(); + const auto& [Valid, StackMember] = StackData.top(); + + MigrateToSlowPathIf(IREmit, !Valid); OrderedNode* CmpNode = nullptr; - if (StackMember == std::nullopt) { + if (SlowPath) { // slow path auto* StackValue = LoadStackValueAtTop_Slow(IREmit); CmpNode = IREmit->_F80Cmp(StackValue, Value, Op->Flags); } else { // fast path - CmpNode = IREmit->_F80Cmp(StackMember->StackDataNode, Value, Op->Flags); + CmpNode = IREmit->_F80Cmp(StackMember.StackDataNode, Value, Op->Flags); } IREmit->Remove(CodeNode); IREmit->ReplaceUsesWithAfter(CodeNode, CmpNode, CodeNode); - Changed = true; break; } case IR::OP_F80ATANSTACK: { LogMan::Msg::DFmt("OP_F80ATANSTACK"); - const auto& StackMember1 = StackData.top(); - const auto& StackMember2 = StackData.top(1); + const auto& [Valid1, StackMember1] = StackData.top(); + const auto& [Valid2, StackMember2] = StackData.top(1); - if (StackMember1 == std::nullopt || StackMember2 == std::nullopt) { + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + + if (SlowPath) { // slow path auto* st0 = LoadStackValueAtTop_Slow(IREmit); auto* st1 = LoadStackValueAtOffset_Slow(IREmit, 1); @@ -808,27 +923,28 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { StoreStackValueAtOffset_Slow(IREmit, 1, Result); } else { // fast path - auto* st0 = StackMember1->StackDataNode; - auto* st1 = StackMember2->StackDataNode; + auto* st0 = StackMember1.StackDataNode; + auto* st1 = StackMember2.StackDataNode; - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = IREmit->_F80ATAN(st1, st0), - .InterpretAsFloat = StackMember1->InterpretAsFloat}, + 
.InterpretAsFloat = StackMember1.InterpretAsFloat}, 1); } IREmit->Remove(CodeNode); - Changed = true; break; } case IR::OP_F80XTRACTSTACK: { LogMan::Msg::DFmt("OP_F80XTRACTSTACK"); - const auto& StackMember = StackData.top(); + const auto& [Valid, StackMember] = StackData.top(); + + MigrateToSlowPathIf(IREmit, !Valid); - if (StackMember == std::nullopt) { + if (SlowPath) { // slow path auto* st0 = LoadStackValueAtTop_Slow(IREmit); @@ -845,44 +961,44 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { auto exp = IREmit->_F80XTRACT_EXP(st0); auto sig = IREmit->_F80XTRACT_SIG(st0); - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = exp, - .InterpretAsFloat = StackMember->InterpretAsFloat}); - StackData.push(StackMemberInfo {.SourceDataSize = StackMember->SourceDataSize, - .StackDataSize = StackMember->StackDataSize, + .InterpretAsFloat = StackMember.InterpretAsFloat}); + StackData.push(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, .SourceDataNode = nullptr, .StackDataNode = sig, - .InterpretAsFloat = StackMember->InterpretAsFloat}); + .InterpretAsFloat = StackMember.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } case IR::OP_F80FPREMSTACK: { LogMan::Msg::DFmt("F80FPREMStack"); - const auto& StackMember1 = StackData.top(); - const auto& StackMember2 = StackData.top(1); + const auto& [Valid1, StackMember1] = StackData.top(); + const auto& [Valid2, StackMember2] = StackData.top(1); - if (StackMember1 == std::nullopt || StackMember2 == std::nullopt) { + MigrateToSlowPathIf(IREmit, !Valid1 || !Valid2); + + if (SlowPath) { // slow path auto* st0 = LoadStackValueAtTop_Slow(IREmit); auto* st1 = LoadStackValueAtOffset_Slow(IREmit, 1); 
StoreStackValueAtTop_Slow(IREmit, IREmit->_F80FPREM(st0, st1)); } else { // fast path - StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1->SourceDataSize, - .StackDataSize = StackMember1->StackDataSize, + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember1.SourceDataSize, + .StackDataSize = StackMember1.StackDataSize, .SourceDataNode = nullptr, - .StackDataNode = IREmit->_F80FPREM(StackMember1->StackDataNode, StackMember2->StackDataNode), - .InterpretAsFloat = StackMember1->InterpretAsFloat}); + .StackDataNode = IREmit->_F80FPREM(StackMember1.StackDataNode, StackMember2.StackDataNode), + .InterpretAsFloat = StackMember1.InterpretAsFloat}); } IREmit->Remove(CodeNode); - Changed = true; break; } @@ -1022,60 +1138,50 @@ bool X87StackOptimization::Run(IREmitter* IREmit) { break; } - default: break; + case IR::OP_SYNCSTACK: { + LogMan::Msg::DFmt("SYNCSTACK"); + + OrderedNode* NewTop = SynchronizeStackValues(IREmit); + IREmit->ReplaceUsesWithAfter(CodeNode, NewTop, CodeNode); + IREmit->Remove(CodeNode); + Changed = true; + + break; } - } - } - // We need to write the registers before any branch in the block, - // so loop until a branch instruction is found. Add instructions _before_ - // the branch instruction. - // TODO(pmatos): we don't need to do this if we don't have any followup - // blocks. How can we check that? OTOH, not writing to the proper registers - // might screw up testing that expects the values to be in the stack registers - // at the end, so maybe we need a testing flag that forces the writing of this - // data to the context. 
- if (StackData.size() != 0) { - LogMan::Msg::DFmt("Writing stack to context\n"); - StackData.dump(); - for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { - for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { - if (IR::IsBlockExit(IROp->Op)) { - LogMan::Msg::DFmt("Found a block exit!\n"); - // Set write cursor to previous instruction - IREmit->SetWriteCursor(IREmit->UnwrapNode(CodeNode->Header.Previous)); - - // Store new top which is now the original top - the number of elements in stack. - // Careful with underflow wraparound. - auto* orig_top = GetX87Top(IREmit); - IREmit->_Print(orig_top); - - auto mask = IREmit->_Constant(0x7); - auto new_top = IREmit->_And(OpSize::i32Bit, IREmit->_Sub(OpSize::i32Bit, orig_top, IREmit->_Constant(StackData.size())), mask); - SetX87ValidTag(IREmit, new_top, true); - SetX87Top(IREmit, new_top); - IREmit->_Print(new_top); - - // Before leaving we need to write the current values in the stack to - // context so that the values are correct. Copy SourceDataNode in the - // stack to the respective mmX register. 
- for (size_t i = 0; i < StackData.size(); ++i) { - LogMan::Msg::DFmt("Writing stack member {} to context TOP+{}", i, i); - Changed = true; - const auto StackMember = StackData.top(i); - LOGMAN_THROW_A_FMT(StackMember != std::nullopt, "Stack does not have enough elements"); - auto* Node = StackMember->StackDataNode; - IREmit->_StoreContextIndexed(Node, IREmit->_Add(OpSize::i32Bit, new_top, IREmit->_Constant(i)), 16, MMBaseOffset(), 16, FPRClass); - } + case IR::OP_INCSTACKTOP: { + if (SlowPath) { + UpdateTop4Pop_Slow(IREmit); + } else { + StackData.rotate(true); + } + IREmit->Remove(CodeNode); + Changed = True; + break; + } - break; + case IR::OP_DECSTACKTOP: { + if (SlowPath) { + UpdateTop4Push_Slow(IREmit); + } else { + StackData.rotate(); } + IREmit->Remove(CodeNode); + Changed = true; + break; + } + + default: break; } } } + // We need to write the registers before any branch in the block, + // so loop until a branch instruction is found. Add instructions _before_ + // the branch instruction. + SynchronizeStackValues(IREmit); IREmit->SetWriteCursor(OriginalWriteCursor); - return Changed; + return true; } fextl::unique_ptr CreateX87StackOptimizationPass() {