From c8c24c5e976a49748c0ef36f44aa543673973fef Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Wed, 19 Jun 2024 07:21:41 -0700
Subject: [PATCH] AVX128: Implement vmaskmov{ps,pd}, vpmaskmov{d,q} using SVE2
 gather loadstores.

Adds AVX128_VMASKMOVImpl, shared by the new AVX128_VMASKMOV and
AVX128_VPMASKMOV handlers, and installs both in the AVX128 handler table in
place of the previous TODO entries.

AVX128_VMASKMOVImpl(Op, ElementSize, DstSize, IsStore, MaskOp, DataOp):
- The operation is treated as 128-bit when DstSize equals
  Core::CPUState::XMM_SSE_REG_SIZE; the mask is loaded as a Low/High pair.
- Stores: the address is built from Op->Dest and the data is loaded from
  DataOp. The low half is written with _VStoreVectorMasked; when the
  operation is not 128-bit a second masked store is issued at Address + 16
  using Mask.High and Data.High.
- Loads: the address is built from DataOp. The low half is read with
  _VLoadVectorMasked; for 128-bit operations the high half of the result is
  a zero vector, otherwise a second masked load is issued at Address + 16
  using Mask.High. The pair is then written to Op->Dest.

AVX128_VMASKMOV takes its element size from a template argument, while
AVX128_VPMASKMOV passes GetSrcSize(Op) as the element size. Both forward
Op->Src[0] as the mask operand and Op->Src[1] as the data operand.
---
Review notes (not part of the commit message):
- The template parameter lists and template arguments for AVX128_VMASKMOV and
  AVX128_VPMASKMOV (and the arguments on the removed VPMASKMOVOp entries) were
  missing from the submitted text and have been filled in. The
  ElementSize/IsStore values mirror the removed VMASKMOVOp<4|8, false|true>
  entries; 0x8C = load and 0x8E = store follows the instruction encoding.
  Please confirm against the tree.
- Indentation and blank context lines were re-created, and other context
  lines may also have lost template arguments; verify that the patch applies
  cleanly before merging.
- The subject mentions "gather" load/stores, but the hunks below only emit
  _VLoadVectorMasked/_VStoreVectorMasked; consider rewording.

 .../Source/Interface/Core/OpcodeDispatcher.h  |  9 +++
 .../Core/OpcodeDispatcher/AVX_128.cpp         | 60 +++++++++++++++++--
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
index 6aa3b6b617..2b7b01745d 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -1164,6 +1164,15 @@ class OpDispatchBuilder final : public IREmitter {
   void AVX128_VMPSADBW(OpcodeArgs);
   void AVX128_VPALIGNR(OpcodeArgs);
 
+  void AVX128_VMASKMOVImpl(OpcodeArgs, size_t ElementSize, size_t DstSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp,
+                           const X86Tables::DecodedOperand& DataOp);
+
+  template<bool IsStore>
+  void AVX128_VPMASKMOV(OpcodeArgs);
+
+  template<size_t ElementSize, bool IsStore>
+  void AVX128_VMASKMOV(OpcodeArgs);
+
   // End of AVX 128-bit implementation
   void InvalidOp(OpcodeArgs);
 
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
index 89476351a0..4534e8d542 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@@ -288,10 +288,10 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
     {OPD(2, 0b01, 0x29), 1, &OpDispatchBuilder::AVX128_VectorALU},
     {OPD(2, 0b01, 0x2A), 1, &OpDispatchBuilder::AVX128_MOVVectorNT},
     {OPD(2, 0b01, 0x2B), 1, &OpDispatchBuilder::AVX128_VPACKUS<4>},
-    // TODO: {OPD(2, 0b01, 0x2C), 1, &OpDispatchBuilder::VMASKMOVOp<4, false>},
-    // TODO: {OPD(2, 0b01, 0x2D), 1, &OpDispatchBuilder::VMASKMOVOp<8, false>},
-    // TODO: {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::VMASKMOVOp<4, true>},
-    // TODO: {OPD(2, 0b01, 0x2F), 1, &OpDispatchBuilder::VMASKMOVOp<8, true>},
+    {OPD(2, 0b01, 0x2C), 1, &OpDispatchBuilder::AVX128_VMASKMOV<4, false>},
+    {OPD(2, 0b01, 0x2D), 1, &OpDispatchBuilder::AVX128_VMASKMOV<8, false>},
+    {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::AVX128_VMASKMOV<4, true>},
+    {OPD(2, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_VMASKMOV<8, true>},
 
     {OPD(2, 0b01, 0x30), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 2, false>},
     {OPD(2, 0b01, 0x31), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 4, false>},
@@ -324,8 +324,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
     {OPD(2, 0b01, 0x78), 1, &OpDispatchBuilder::AVX128_VBROADCAST<1>},
     {OPD(2, 0b01, 0x79), 1, &OpDispatchBuilder::AVX128_VBROADCAST<2>},
 
-    // TODO: {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
-    // TODO: {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},
+    {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<false>},
+    {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<true>},
 
     {OPD(2, 0b01, 0xDB), 1, &OpDispatchBuilder::AVX128_VAESImc},
     {OPD(2, 0b01, 0xDC), 1, &OpDispatchBuilder::AVX128_VAESEnc},
@@ -2552,4 +2552,52 @@ void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) {
   });
 }
 
+void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, size_t ElementSize, size_t DstSize, bool IsStore,
+                                            const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) {
+  const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
+
+  auto Mask = AVX128_LoadSource_WithOpSize(Op, MaskOp, Op->Flags, !Is128Bit);
+
+  const auto MakeAddress = [this, Op](const X86Tables::DecodedOperand& Data) {
+    return MakeSegmentAddress(Op, Data, CTX->GetGPRSize());
+  };
+
+  ///< TODO: Needs SVE for masked loadstores.
+  if (IsStore) {
+    auto Address = MakeAddress(Op->Dest);
+
+    auto Data = AVX128_LoadSource_WithOpSize(Op, DataOp, Op->Flags, !Is128Bit);
+    _VStoreVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Data.Low, Address, Invalid(), MEM_OFFSET_SXTX, 1);
+    if (!Is128Bit) {
+      ///< TODO: This can be cleaner if AVX128_LoadSource_WithOpSize could return both constructed addresses.
+      auto AddressHigh = _Add(OpSize::i64Bit, Address, _Constant(16));
+      _VStoreVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, Data.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, 1);
+    }
+  } else {
+    auto Address = MakeAddress(DataOp);
+
+    RefPair Result {};
+    Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MEM_OFFSET_SXTX, 1);
+
+    if (Is128Bit) {
+      Result.High = LoadZeroVector(OpSize::i128Bit);
+    } else {
+      ///< TODO: This can be cleaner if AVX128_LoadSource_WithOpSize could return both constructed addresses.
+      auto AddressHigh = _Add(OpSize::i64Bit, Address, _Constant(16));
+      Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, AddressHigh, Invalid(), MEM_OFFSET_SXTX, 1);
+    }
+    AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
+  }
+}
+
+template<bool IsStore>
+void OpDispatchBuilder::AVX128_VPMASKMOV(OpcodeArgs) {
+  AVX128_VMASKMOVImpl(Op, GetSrcSize(Op), GetDstSize(Op), IsStore, Op->Src[0], Op->Src[1]);
+}
+
+template<size_t ElementSize, bool IsStore>
+void OpDispatchBuilder::AVX128_VMASKMOV(OpcodeArgs) {
+  AVX128_VMASKMOVImpl(Op, ElementSize, GetDstSize(Op), IsStore, Op->Src[0], Op->Src[1]);
+}
+
 } // namespace FEXCore::IR