Skip to content

Commit

Permalink
WIP: Loads
Browse files Browse the repository at this point in the history
  • Loading branch information
pmatos committed Dec 2, 2024
1 parent e01a568 commit af05434
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 18 deletions.
18 changes: 9 additions & 9 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ namespace x64 {

// p6 and p7 registers are used as temporaries, so they are not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5.
constexpr std::array<ARMEmitter::PRegister, 6> PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2,
ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used since we can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand Down Expand Up @@ -111,9 +111,9 @@ namespace x64 {

// p6 and p7 registers are used as temporaries, so they are not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5.
constexpr std::array<ARMEmitter::PRegister, 6> PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2,
ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used since we can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand Down Expand Up @@ -248,9 +248,9 @@ namespace x32 {

// p6 and p7 registers are used as temporaries, so they are not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5.
constexpr std::array<ARMEmitter::PRegister, 6> PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2,
ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used since we can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

// All are caller saved
constexpr std::array<ARMEmitter::VRegister, 8> SRAFPR = {
Expand Down
31 changes: 31 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1591,6 +1591,37 @@ DEF_OP(StoreMemPredicate) {
}
}

// Emits a predicated SVE contiguous load for the LoadMemPredicate IR op.
//
// Operands (taken from the IR op):
//   - Mask: predicate register governing which elements are loaded; it is
//           passed to the emitter in its zeroing form (Predicate.Zeroing()).
//   - Addr: GPR holding the base address; the load uses an immediate offset of 0.
// Result: the vector register allocated for this node (GetVReg(Node)).
//
// The load instruction is selected by the op header's ElementSize
// (ld1b / ld1h / ld1w / ld1d for 8/16/32/64-bit elements).
DEF_OP(LoadMemPredicate) {
  // Decode through the load op's own struct. The IR definition of
  // LoadMemPredicate only carries $Mask and $Addr, so decoding it through the
  // store op's struct would fetch the operands through the wrong layout.
  const auto Op = IROp->C<IR::IROp_LoadMemPredicate>();
  const auto Dst = GetVReg(Node);
  const auto Predicate = GetPReg(Op->Mask.ID());
  const auto MemReg = GetReg(Op->Addr.ID());

  LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemPredicate needs SVE support");

  // Base register + zero immediate offset.
  const auto MemSrc = ARMEmitter::SVEMemOperand(MemReg.X(), 0);

  switch (IROp->ElementSize) {
  case IR::OpSize::i8Bit: {
    ld1b<ARMEmitter::SubRegSize::i8Bit>(Dst.Z(), Predicate.Zeroing(), MemSrc);
    break;
  }
  case IR::OpSize::i16Bit: {
    ld1h<ARMEmitter::SubRegSize::i16Bit>(Dst.Z(), Predicate.Zeroing(), MemSrc);
    break;
  }
  case IR::OpSize::i32Bit: {
    ld1w<ARMEmitter::SubRegSize::i32Bit>(Dst.Z(), Predicate.Zeroing(), MemSrc);
    break;
  }
  case IR::OpSize::i64Bit: {
    ld1d(Dst.Z(), Predicate.Zeroing(), MemSrc);
    break;
  }
  default:
    // No load would be emitted for this size, leaving Dst without a defined
    // value; flag it in assertion-enabled builds instead of silently continuing.
    // NOTE(review): assumes the IR emitter always populates ElementSize for
    // this op with one of the sizes above - confirm against the IR generator.
    LOGMAN_THROW_A_FMT(false, "Unhandled LoadMemPredicate element size");
    break;
  }
}

DEF_OP(StoreMemPair) {
const auto Op = IROp->C<IR::IROp_StoreMemPair>();
const auto OpSize = IROp->Size;
Expand Down
28 changes: 19 additions & 9 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ desc: Handles x86/64 ops to IR, no-pf opt, local-flags opt
$end_info$
*/

#include "FEXCore/Core/HostFeatures.h"
#include "FEXCore/Utils/Telemetry.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/OpcodeDispatcher.h"
Expand Down Expand Up @@ -4309,10 +4310,15 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
if ((IsOperandMem(Operand, true) && LoadData) || ForceLoad) {
if (OpSize == OpSize::f80Bit) {
Ref MemSrc = LoadEffectiveAddress(A, true);

// For X87 extended doubles, Split the load.
auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align);
return _VLoadVectorElement(OpSize::i128Bit, OpSize::i16Bit, Res, 4, _Add(OpSize::i64Bit, MemSrc, _InlineConstant(8)));
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
return _LoadMemPredicate(FPRClass, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align);
return _VLoadVectorElement(OpSize::i128Bit, OpSize::i16Bit, Res, 4, _Add(OpSize::i64Bit, MemSrc, _InlineConstant(8)));
}
}

return _LoadMemAutoTSO(Class, OpSize, A, Align == OpSize::iInvalid ? OpSize : Align);
Expand Down Expand Up @@ -4438,12 +4444,16 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */);

if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
LOGMAN_THROW_A_FMT(false, "trying to store 80bit with SVE broken atm");
} else {
Ref MemStoreDst = LoadEffectiveAddress(A, true);

// For X87 extended doubles, split before storing
_StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align);
auto Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 1);
_StoreMem(GPRClass, OpSize::i16Bit, Upper, MemStoreDst, _Constant(8), std::min(Align, OpSize::i64Bit), MEM_OFFSET_SXTX, 1);
// For X87 extended doubles, split before storing
_StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align);
auto Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 1);
_StoreMem(GPRClass, OpSize::i16Bit, Upper, MemStoreDst, _Constant(8), std::min(Align, OpSize::i64Bit), MEM_OFFSET_SXTX, 1);
}
} else {
_StoreMemAutoTSO(Class, OpSize, A, Src, Align == OpSize::iInvalid ? OpSize : Align);
}
Expand Down
6 changes: 6 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,12 @@
]
},

"FPR = LoadMemPredicate RegisterClass:$Class, OpSize:#Size, PRED:$Mask, GPR:$Addr": {
"Desc": [ "Loads a value from memory using an SVE predicate mask." ],
"HasSideEffects": true,
"DestSize": "Size"
},

"SSA = LoadMemTSO RegisterClass:$Class, OpSize:#Size, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
"Desc": ["Does a x86 TSO compatible load from memory. Offset must be Invalid()."
],
Expand Down

0 comments on commit af05434

Please sign in to comment.