From 214f04a104c1c384fca7ff9bb8204368e8a8bd9d Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Wed, 20 Nov 2024 10:50:40 +0100 Subject: [PATCH] Generate SVE for 80bit stores when possible Fixes #4126 --- .../Source/Interface/Core/JIT/MemoryOps.cpp | 11 +++++++ FEXCore/Source/Interface/IR/PassManager.cpp | 2 +- FEXCore/Source/Interface/IR/Passes.h | 3 +- .../IR/Passes/x87StackOptimizationPass.cpp | 29 ++++++++++++------- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index d9ce167ec8..f4dbfd1764 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -1536,6 +1536,17 @@ DEF_OP(StoreMem) { str(Src.D(), MemSrc); break; } + case IR::OpSize::f80Bit: { + LOGMAN_THROW_A_FMT(HostSupportsSVE128, "80-bit stores are only supported with SVE128 support"); + const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); + // We need to generate the predicate register to copy the 10-byte value + // FIXME: don't hardcode p5 + ARMEmitter::PRegister preg = ARMEmitter::PReg::p5; + mov(TMP1, 10); + whilelt(ARMEmitter::SubRegSize::i8Bit, preg, ARMEmitter::XReg::zr, TMP1); + st1b(Src.Z(), preg, MemSrc); + break; + } case IR::OpSize::i128Bit: { str(Src.Q(), MemSrc); break; diff --git a/FEXCore/Source/Interface/IR/PassManager.cpp b/FEXCore/Source/Interface/IR/PassManager.cpp index 5072364a68..fccbd41c56 100644 --- a/FEXCore/Source/Interface/IR/PassManager.cpp +++ b/FEXCore/Source/Interface/IR/PassManager.cpp @@ -70,7 +70,7 @@ void PassManager::AddDefaultPasses(FEXCore::Context::ContextImpl* ctx) { FEX_CONFIG_OPT(DisablePasses, O0); if (!DisablePasses()) { - InsertPass(CreateX87StackOptimizationPass()); + InsertPass(CreateX87StackOptimizationPass(ctx->HostFeatures)); InsertPass(CreateConstProp(ctx->HostFeatures.SupportsTSOImm9, &ctx->CPUID)); InsertPass(CreateDeadFlagCalculationEliminination()); } diff --git a/FEXCore/Source/Interface/IR/Passes.h b/FEXCore/Source/Interface/IR/Passes.h index a8a3d1cb64..fa230264fc 100644 --- a/FEXCore/Source/Interface/IR/Passes.h +++ b/FEXCore/Source/Interface/IR/Passes.h @@ -5,6 +5,7 @@ namespace FEXCore { class CPUIDEmu; +struct HostFeatures; } namespace FEXCore::Utils { @@ -19,7 +20,7 @@ class RegisterAllocationData; fextl::unique_ptr CreateConstProp(bool SupportsTSOImm9, const FEXCore::CPUIDEmu* CPUID); fextl::unique_ptr CreateDeadFlagCalculationEliminination(); fextl::unique_ptr CreateRegisterAllocationPass(); -fextl::unique_ptr CreateX87StackOptimizationPass(); +fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures&); namespace Validation { fextl::unique_ptr CreateIRValidation(); diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index f342006530..ca0798a6d9 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -3,9 +3,10 @@ #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" -#include -#include -#include +#include "FEXCore/IR/IR.h" +#include "FEXCore/Utils/Profiler.h" +#include "FEXCore/fextl/deque.h" +#include "FEXCore/Core/HostFeatures.h" #include #include @@ -146,13 +147,15 @@ class FixedSizeStack { class X87StackOptimization final : public Pass { public: - X87StackOptimization() { + X87StackOptimization(const FEXCore::HostFeatures& Features) + : Features(Features) { FEX_CONFIG_OPT(ReducedPrecision, X87REDUCEDPRECISION); ReducedPrecisionMode = ReducedPrecision; } void Run(IREmitter* Emit) override; private: + const FEXCore::HostFeatures& Features; bool ReducedPrecisionMode; // Helpers @@ -820,11 +823,15 @@ void X87StackOptimization::Run(IREmitter* Emit) { StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - // For X87 extended doubles, split before storing - IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); - auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1); - auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8)); - IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit); + if (Features.SupportsSVE128) { + IREmit->_StoreMem(FPRClass, OpSize::f80Bit, AddrNode, StackNode); + } else { + // For X87 extended doubles, split before storing + IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); + auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1); + auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8)); + IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit); + } } else { IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode); } @@ -1025,7 +1032,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { return; } -fextl::unique_ptr CreateX87StackOptimizationPass() { - return fextl::make_unique(); +fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures& Features) { + return fextl::make_unique(Features); } } // namespace FEXCore::IR