From 0c6c4cd53219d293534906842e087e986b2a3635 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 21 Jun 2024 15:00:06 -0400 Subject: [PATCH 01/13] OpcodeDispatcher: make FCMP more compact I told Ryan to change this for AVX, but it needs to be changed in the original to match! Signed-off-by: Alyssa Rosenzweig --- .../Core/OpcodeDispatcher/Vector.cpp | 113 +++++------------- 1 file changed, 27 insertions(+), 86 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index a969f90086..23bf03897a 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -656,56 +656,31 @@ Ref OpDispatchBuilder::InsertScalarFCMPOpImpl(OpcodeArgs, size_t DstSize, size_t Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, DstSize, Op->Flags); Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - switch (CompType) { - case 0x00: - case 0x08: - case 0x10: - case 0x18: // EQ + switch (CompType & 7) { + case 0x0: // EQ return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::EQ, ZeroUpperBits); - case 0x01: - case 0x09: - case 0x11: - case 0x19: // LT, GT(Swapped operand) + case 0x1: // LT, GT(Swapped operand) return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::LT, ZeroUpperBits); - case 0x02: - case 0x0A: - case 0x12: - case 0x1A: // LE, GE(Swapped operand) + case 0x2: // LE, GE(Swapped operand) return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::LE, ZeroUpperBits); - case 0x03: - case 0x0B: - case 0x13: - case 0x1B: // Unordered + case 0x3: // Unordered return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::UNO, ZeroUpperBits); - case 0x04: - case 0x0C: - case 0x14: - case 0x1C: // NEQ + case 0x4: // NEQ 
return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::NEQ, ZeroUpperBits); - case 0x05: - case 0x0D: - case 0x15: - case 0x1D: { // NLT, NGT(Swapped operand) + case 0x5: { // NLT, NGT(Swapped operand) Ref Result = _VFCMPLT(ElementSize, ElementSize, Src1, Src2); Result = _VNot(ElementSize, ElementSize, Result); // Insert the lower bits return _VInsElement(GetDstSize(Op), ElementSize, 0, 0, Src1, Result); } - case 0x06: - case 0x0E: - case 0x16: - case 0x1E: { // NLE, NGE(Swapped operand) + case 0x6: { // NLE, NGE(Swapped operand) Ref Result = _VFCMPLE(ElementSize, ElementSize, Src1, Src2); Result = _VNot(ElementSize, ElementSize, Result); // Insert the lower bits return _VInsElement(GetDstSize(Op), ElementSize, 0, 0, Src1, Result); } - case 0x07: - case 0x0F: - case 0x17: - case 0x1F: // Ordered + case 0x7: // Ordered return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::ORD, ZeroUpperBits); - default: LOGMAN_MSG_A_FMT("Unknown Comparison type: {}", CompType); break; } FEX_UNREACHABLE; } @@ -2522,61 +2497,27 @@ Ref OpDispatchBuilder::VFCMPOpImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref const auto Size = GetSrcSize(Op); Ref Result {}; - switch (CompType) { - case 0x00: - case 0x08: - case 0x10: - case 0x18: // EQ - Result = _VFCMPEQ(Size, ElementSize, Src1, Src2); - break; - case 0x01: - case 0x09: - case 0x11: - case 0x19: // LT, GT(Swapped operand) + switch (CompType & 0x7) { + case 0x0: // EQ + return _VFCMPEQ(Size, ElementSize, Src1, Src2); + case 0x1: // LT, GT(Swapped operand) + return _VFCMPLT(Size, ElementSize, Src1, Src2); + case 0x2: // LE, GE(Swapped operand) + return _VFCMPLE(Size, ElementSize, Src1, Src2); + case 0x3: // Unordered + return _VFCMPUNO(Size, ElementSize, Src1, Src2); + case 0x4: // NEQ + return _VFCMPNEQ(Size, ElementSize, Src1, Src2); + case 0x5: // NLT, NGT(Swapped operand) Result = _VFCMPLT(Size, ElementSize, Src1, Src2); - break; - case 0x02: - case 
0x0A: - case 0x12: - case 0x1A: // LE, GE(Swapped operand) + return _VNot(Size, ElementSize, Result); + case 0x6: // NLE, NGE(Swapped operand) Result = _VFCMPLE(Size, ElementSize, Src1, Src2); - break; - case 0x03: - case 0x0B: - case 0x13: - case 0x1B: // Unordered - Result = _VFCMPUNO(Size, ElementSize, Src1, Src2); - break; - case 0x04: - case 0x0C: - case 0x14: - case 0x1C: // NEQ - Result = _VFCMPNEQ(Size, ElementSize, Src1, Src2); - break; - case 0x05: - case 0x0D: - case 0x15: - case 0x1D: // NLT, NGT(Swapped operand) - Result = _VFCMPLT(Size, ElementSize, Src1, Src2); - Result = _VNot(Size, ElementSize, Result); - break; - case 0x06: - case 0x0E: - case 0x16: - case 0x1E: // NLE, NGE(Swapped operand) - Result = _VFCMPLE(Size, ElementSize, Src1, Src2); - Result = _VNot(Size, ElementSize, Result); - break; - case 0x07: - case 0x0F: - case 0x17: - case 0x1F: // Ordered - Result = _VFCMPORD(Size, ElementSize, Src1, Src2); - break; - default: LOGMAN_MSG_A_FMT("Unknown Comparison type: {}", CompType); break; + return _VNot(Size, ElementSize, Result); + case 0x7: // Ordered + return _VFCMPORD(Size, ElementSize, Src1, Src2); } - - return Result; + FEX_UNREACHABLE; } template From cd03932bd167d2c82c9ccfb5e0165778fa11de3a Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 21 Jun 2024 15:16:59 -0400 Subject: [PATCH 02/13] OpcodeDispatcher: tweak InsertScalarFCMPOpImpl signature so AVX128 can reuse it. 
Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/Core/OpcodeDispatcher.h | 3 +- .../Core/OpcodeDispatcher/Vector.cpp | 43 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index fa9fab6e98..f1a6c43a00 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1225,8 +1225,7 @@ class OpDispatchBuilder final : public IREmitter { Ref InsertScalarRoundImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, uint64_t Mode, bool ZeroUpperBits); - Ref InsertScalarFCMPOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, - const X86Tables::DecodedOperand& Src2Op, uint8_t CompType, bool ZeroUpperBits); + Ref InsertScalarFCMPOpImpl(OpSize Size, uint8_t OpDstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType, bool ZeroUpperBits); Ref VectorRoundImpl(OpcodeArgs, size_t ElementSize, Ref Src, uint64_t Mode); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 23bf03897a..840db9d68e 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -646,41 +646,33 @@ template void OpDispatchBuilder::AVXInsertScalarRound<4>(OpcodeArgs); template void OpDispatchBuilder::AVXInsertScalarRound<8>(OpcodeArgs); -Ref OpDispatchBuilder::InsertScalarFCMPOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, - const X86Tables::DecodedOperand& Src2Op, uint8_t CompType, bool ZeroUpperBits) { - // We load the full vector width when dealing with a source vector, - // so that we don't do any unnecessary zero extension to the scalar - // element that we're going to operate 
on. - const auto SrcSize = GetSrcSize(Op); - - Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Src1Op, DstSize, Op->Flags); - Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - +Ref OpDispatchBuilder::InsertScalarFCMPOpImpl(OpSize Size, uint8_t OpDstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType, + bool ZeroUpperBits) { switch (CompType & 7) { case 0x0: // EQ - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::EQ, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::EQ, ZeroUpperBits); case 0x1: // LT, GT(Swapped operand) - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::LT, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::LT, ZeroUpperBits); case 0x2: // LE, GE(Swapped operand) - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::LE, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::LE, ZeroUpperBits); case 0x3: // Unordered - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::UNO, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::UNO, ZeroUpperBits); case 0x4: // NEQ - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::NEQ, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::NEQ, ZeroUpperBits); case 0x5: { // NLT, NGT(Swapped operand) Ref Result = _VFCMPLT(ElementSize, ElementSize, Src1, Src2); Result = _VNot(ElementSize, ElementSize, Result); // Insert the lower bits - return _VInsElement(GetDstSize(Op), ElementSize, 0, 0, Src1, Result); + return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result); } case 0x6: { // NLE, NGE(Swapped operand) Ref Result = _VFCMPLE(ElementSize, 
ElementSize, Src1, Src2); Result = _VNot(ElementSize, ElementSize, Result); // Insert the lower bits - return _VInsElement(GetDstSize(Op), ElementSize, 0, 0, Src1, Result); + return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result); } case 0x7: // Ordered - return _VFCMPScalarInsert(IR::SizeToOpSize(DstSize), ElementSize, Src1, Src2, FloatCompareOp::ORD, ZeroUpperBits); + return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::ORD, ZeroUpperBits); } FEX_UNREACHABLE; } @@ -689,8 +681,12 @@ template void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs) { const uint8_t CompType = Op->Src[1].Literal(); const auto DstSize = GetGuestVectorLength(); + const auto SrcSize = GetSrcSize(Op); + + Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags); + Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - Ref Result = InsertScalarFCMPOpImpl(Op, DstSize, ElementSize, Op->Dest, Op->Src[0], CompType, false); + Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), GetDstSize(Op), ElementSize, Src1, Src2, CompType, false); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, -1); } @@ -701,8 +697,15 @@ template void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) { const uint8_t CompType = Op->Src[2].Literal(); const auto DstSize = GetGuestVectorLength(); + const auto SrcSize = GetSrcSize(Op); + + // We load the full vector width when dealing with a source vector, + // so that we don't do any unnecessary zero extension to the scalar + // element that we're going to operate on. 
+ Ref Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags); + Ref Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true}); - Ref Result = InsertScalarFCMPOpImpl(Op, DstSize, ElementSize, Op->Src[0], Op->Src[1], CompType, true); + Ref Result = InsertScalarFCMPOpImpl(IR::SizeToOpSize(DstSize), GetDstSize(Op), ElementSize, Src1, Src2, CompType, true); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, -1); } From 2a6d6a9d13d602f0028542b4cb3d7ad27f0558ec Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 17 Jun 2024 22:22:56 -0700 Subject: [PATCH 03/13] AVX128: Implement support for v{u,}comis{s,d} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 ++ .../Core/OpcodeDispatcher/AVX_128.cpp | 27 ++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index f1a6c43a00..59a0f74792 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1029,6 +1029,8 @@ class OpDispatchBuilder final : public IREmitter { Ref AVX128_PSIGNImpl(size_t ElementSize, Ref Src1, Ref Src2); template void AVX128_VPSIGN(OpcodeArgs); + template + void AVX128_UCOMISx(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index caadbd3879..9e45c50218 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -69,10 +69,10 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b10, 0x2D), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<4, true>}, {OPD(1, 0b11, 0x2D), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<8, true>}, - // TODO: {OPD(1, 0b00, 0x2E), 1, &OpDispatchBuilder::UCOMISxOp<4>}, - // TODO: {OPD(1, 0b01, 0x2E), 
1, &OpDispatchBuilder::UCOMISxOp<8>}, - // TODO: {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::UCOMISxOp<4>}, - // TODO: {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::UCOMISxOp<8>}, + {OPD(1, 0b00, 0x2E), 1, &OpDispatchBuilder::AVX128_UCOMISx<4>}, + {OPD(1, 0b01, 0x2E), 1, &OpDispatchBuilder::AVX128_UCOMISx<8>}, + {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<4>}, + {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<8>}, // TODO: {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::MOVMSKOp<4>}, // TODO: {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::MOVMSKOp<8>}, @@ -901,4 +901,23 @@ void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs) { [this](size_t _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); }); } +template +void OpDispatchBuilder::AVX128_UCOMISx(OpcodeArgs) { + const auto SrcSize = Op->Src[0].IsGPR() ? GetGuestVectorLength() : GetSrcSize(Op); + + auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, false); + + RefPair Src2 {}; + + // Careful here, if the source is from a GPR then we want to load the full 128-bit lower half. + // If it is memory then we only want to load the element size. 
+ if (Op->Src[0].IsGPR()) { + Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); + } else { + Src2.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags); + } + + Comiss(ElementSize, Src1.Low, Src2.Low); +} + } // namespace FEXCore::IR From df232f567b4492d31d954bd8e5edcfb660d09317 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 17 Jun 2024 22:33:07 -0700 Subject: [PATCH 04/13] AVX128: Implement support for v{add,sub,mul,fmin,fmax,fdiv,sqrt,rsqrt,rcp}s{s,d} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 + .../Core/OpcodeDispatcher/AVX_128.cpp | 53 +++++++++++++------ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 59a0f74792..3dbf156e5b 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1031,6 +1031,8 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_VPSIGN(OpcodeArgs); template void AVX128_UCOMISx(OpcodeArgs); + template + void AVX128_VectorScalarInsertALU(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 9e45c50218..a67d1c11d1 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -79,14 +79,14 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::AVX128_VectorUnary}, {OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::AVX128_VectorUnary}, - // TODO: {OPD(1, 0b10, 0x51), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x51), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, + {OPD(1, 0b10, 0x51), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x51), 1, 
&OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x52), 1, &OpDispatchBuilder::AVX128_VectorUnary}, - // TODO: {OPD(1, 0b10, 0x52), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, + {OPD(1, 0b10, 0x52), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x53), 1, &OpDispatchBuilder::AVX128_VectorUnary}, - // TODO: {OPD(1, 0b10, 0x53), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, + {OPD(1, 0b10, 0x53), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x54), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x54), 1, &OpDispatchBuilder::AVX128_VectorALU}, @@ -102,13 +102,13 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0x58), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x58), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b10, 0x58), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x58), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x58), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x58), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x59), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x59), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b10, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x59), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x59), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, // TODO: {OPD(1, 0b00, 0x5A), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Float<8, 4>}, // TODO: {OPD(1, 0b01, 0x5A), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Float<4, 8>}, @@ -121,23 +121,23 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0x5C), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x5C), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: 
{OPD(1, 0b10, 0x5C), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x5C), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x5C), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x5C), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x5D), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x5D), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b10, 0x5D), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x5D), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x5D), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x5D), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x5E), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x5E), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b10, 0x5E), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x5E), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x5E), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x5E), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b00, 0x5F), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0x5F), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b10, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, - // TODO: {OPD(1, 0b11, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, + {OPD(1, 0b10, 0x5F), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, + {OPD(1, 0b11, 0x5F), 1, &OpDispatchBuilder::AVX128_VectorScalarInsertALU}, {OPD(1, 0b01, 0x60), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<1>}, {OPD(1, 0b01, 0x61), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<2>}, @@ -920,4 +920,25 @@ void OpDispatchBuilder::AVX128_UCOMISx(OpcodeArgs) { Comiss(ElementSize, Src1.Low, Src2.Low); } +template +void OpDispatchBuilder::AVX128_VectorScalarInsertALU(OpcodeArgs) { + // We load the 
full vector width when dealing with a source vector, + // so that we don't do any unnecessary zero extension to the scalar + // element that we're going to operate on. + const auto SrcSize = GetSrcSize(Op); + + auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); + RefPair Src2 {}; + if (Op->Src[1].IsGPR()) { + Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); + } else { + Src2.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags); + } + + // If OpSize == ElementSize then it only does the lower scalar op + DeriveOp(Result_Low, IROp, _VFAddScalarInsert(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, false)); + auto High = LoadZeroVector(OpSize::i128Bit); + AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = High}); +} + } // namespace FEXCore::IR From a0ced2b685de8c5c4c27e6118967a35d08c3874f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 17 Jun 2024 22:55:42 -0700 Subject: [PATCH 05/13] AVX128: Implement support for vcmpp{s,d} --- FEXCore/Source/Interface/Core/OpcodeDispatcher.h | 3 +++ .../Interface/Core/OpcodeDispatcher/AVX_128.cpp | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 3dbf156e5b..032937d83d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1033,6 +1033,9 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_UCOMISx(OpcodeArgs); template void AVX128_VectorScalarInsertALU(OpcodeArgs); + Ref AVX128_VFCMPImpl(size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType); + template + void AVX128_VFCMP(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index a67d1c11d1..def9054b2f 100644 --- 
a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -179,8 +179,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, - // TODO: {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp<4>}, - // TODO: {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp<8>}, + {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<4>}, + {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<8>}, // TODO: {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<4>}, // TODO: {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<8>}, @@ -941,4 +941,13 @@ void OpDispatchBuilder::AVX128_VectorScalarInsertALU(OpcodeArgs) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = High}); } +template +void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) { + const uint8_t CompType = Op->Src[2].Literal(); + + AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, [this, Op, CompType](size_t _ElementSize, Ref Src1, Ref Src2) { + return VFCMPOpImpl(Op, _ElementSize, Src1, Src2, CompType); + }); +} + } // namespace FEXCore::IR From 9566dda73e9a2a896ce845d3f8cea22e800c8ef1 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 17 Jun 2024 23:05:30 -0700 Subject: [PATCH 06/13] AVX128: Implement support for vcmps{s,d} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 ++ .../Core/OpcodeDispatcher/AVX_128.cpp | 28 +++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 032937d83d..46d346565a 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1036,6 +1036,8 @@ class OpDispatchBuilder final : public IREmitter { Ref AVX128_VFCMPImpl(size_t ElementSize, Ref 
Src1, Ref Src2, uint8_t CompType); template void AVX128_VFCMP(OpcodeArgs); + template + void AVX128_InsertScalarFCMP(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index def9054b2f..116855c323 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -181,8 +181,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<4>}, {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<8>}, - // TODO: {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<4>}, - // TODO: {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<8>}, + {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<4>}, + {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<8>}, // TODO: {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::VPINSRWOp}, // TODO: {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::PExtrOp<2>}, @@ -950,4 +950,28 @@ void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) { }); } +template +void OpDispatchBuilder::AVX128_InsertScalarFCMP(OpcodeArgs) { + // We load the full vector width when dealing with a source vector, + // so that we don't do any unnecessary zero extension to the scalar + // element that we're going to operate on. 
+ const auto SrcSize = GetSrcSize(Op); + + auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); + RefPair Src2 {}; + + if (Op->Src[1].IsGPR()) { + Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); + } else { + Src2.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags); + } + + const uint8_t CompType = Op->Src[2].Literal(); + + RefPair Result {}; + Result.Low = InsertScalarFCMPOpImpl(OpSize::i128Bit, OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, CompType, false); + Result.High = LoadZeroVector(OpSize::i128Bit); + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + } // namespace FEXCore::IR From 3045578da4e91c92671dcdef05d6e8acb08a1490 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 00:39:47 -0700 Subject: [PATCH 07/13] AVX128: Implement vmov{d,q} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 + .../Core/OpcodeDispatcher/AVX_128.cpp | 38 ++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 46d346565a..50ef9f9db6 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1039,6 +1039,8 @@ class OpDispatchBuilder final : public IREmitter { template void AVX128_InsertScalarFCMP(OpcodeArgs); + void AVX128_MOVBetweenGPR_FPR(OpcodeArgs); + // End of AVX 128-bit implementation void InvalidOp(OpcodeArgs); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 116855c323..8cff3097b9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -153,7 +153,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::AVX128_VPACKSS<4>}, {OPD(1, 0b01, 0x6C), 1, 
&OpDispatchBuilder::AVX128_VPUNPCKL<8>}, {OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<8>}, - // TODO: {OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR}, + {OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR}, {OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, @@ -173,7 +173,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { // TODO: {OPD(1, 0b01, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<8>}, // TODO: {OPD(1, 0b11, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<4>}, - // TODO: {OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR}, + {OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR}, {OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::AVX128_MOVQ}, {OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, @@ -974,4 +974,38 @@ void OpDispatchBuilder::AVX128_InsertScalarFCMP(OpcodeArgs) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } +void OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR(OpcodeArgs) { + if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= FEXCore::X86State::REG_XMM_0) { + ///< XMM <- Reg/Mem + + RefPair Result {}; + if (Op->Src[0].IsGPR()) { + // Loading from GPR and moving to Vector. + Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], CTX->GetGPRSize(), Op->Flags); + // zext to 128bit + Result.Low = _VCastFromGPR(OpSize::i128Bit, GetSrcSize(Op), Src); + } else { + // Loading from Memory as a scalar. Zero extend + Result.Low = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); + } + + Result.High = LoadZeroVector(OpSize::i128Bit); + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); + } else { + ///< Reg/Mem <- XMM + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); + + if (Op->Dest.IsGPR()) { + auto ElementSize = GetDstSize(Op); + // Extract element from GPR. Zero extending in the process. 
+ Src.Low = _VExtractToGPR(GetSrcSize(Op), ElementSize, Src.Low, 0); + StoreResult(GPRClass, Op, Op->Dest, Src.Low, -1); + } else { + // Storing first element to memory. + Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); + _StoreMem(FPRClass, GetDstSize(Op), Dest, Src.Low, 1); + } + } +} + } // namespace FEXCore::IR From d1dd055e6a0ba2acfa7e96cf23de7cf050850d43 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 00:46:38 -0700 Subject: [PATCH 08/13] AVX128: Implement support for vpextr{b,w,d,q} --- .../Source/Interface/Core/OpcodeDispatcher.h | 3 +- .../Core/OpcodeDispatcher/AVX_128.cpp | 43 ++++++++++++++++--- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 50ef9f9db6..83cd3ac109 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1038,8 +1038,9 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_VFCMP(OpcodeArgs); template void AVX128_InsertScalarFCMP(OpcodeArgs); - void AVX128_MOVBetweenGPR_FPR(OpcodeArgs); + template + void AVX128_PExtr(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 8cff3097b9..5303875467 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -185,7 +185,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<8>}, // TODO: {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::VPINSRWOp}, - // TODO: {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::PExtrOp<2>}, + {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::AVX128_PExtr<2>}, // TODO: {OPD(1, 0b00, 0xC6), 1, &OpDispatchBuilder::VSHUFOp<4>}, // TODO: {OPD(1, 0b01, 0xC6), 1, 
&OpDispatchBuilder::VSHUFOp<8>}, @@ -348,10 +348,10 @@ void OpDispatchBuilder::InstallAVX128Handlers() { // TODO: {OPD(3, 0b01, 0x0E), 1, &OpDispatchBuilder::VPBLENDWOp}, // TODO: {OPD(3, 0b01, 0x0F), 1, &OpDispatchBuilder::VPALIGNROp}, - // TODO: {OPD(3, 0b01, 0x14), 1, &OpDispatchBuilder::PExtrOp<1>}, - // TODO: {OPD(3, 0b01, 0x15), 1, &OpDispatchBuilder::PExtrOp<2>}, - // TODO: {OPD(3, 0b01, 0x16), 1, &OpDispatchBuilder::PExtrOp<4>}, - // TODO: {OPD(3, 0b01, 0x17), 1, &OpDispatchBuilder::PExtrOp<4>}, + {OPD(3, 0b01, 0x14), 1, &OpDispatchBuilder::AVX128_PExtr<1>}, + {OPD(3, 0b01, 0x15), 1, &OpDispatchBuilder::AVX128_PExtr<2>}, + {OPD(3, 0b01, 0x16), 1, &OpDispatchBuilder::AVX128_PExtr<4>}, + {OPD(3, 0b01, 0x17), 1, &OpDispatchBuilder::AVX128_PExtr<4>}, // TODO: {OPD(3, 0b01, 0x18), 1, &OpDispatchBuilder::VINSERTOp}, // TODO: {OPD(3, 0b01, 0x19), 1, &OpDispatchBuilder::VEXTRACT128Op}, @@ -1008,4 +1008,37 @@ void OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR(OpcodeArgs) { } } +template +void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) { // Extracts the vector element selected by the immediate in Src[1] from the low 128 bits of Src[0] and writes it to a GPR or to memory. + const auto DstSize = GetDstSize(Op); + + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); + uint64_t Index = Op->Src[1].Literal(); + + // Fixup of 32-bit element size. + // When the element size is 32-bit then it can be overridden as 64-bit because the encoding of PEXTRD/PEXTRQ + // is the same except that REX.W or VEX.W is set to 1. Incredibly frustrating. + // Use the destination size as the element size in this case. + size_t OverridenElementSize = ElementSize; + if constexpr (ElementSize == 4) { + OverridenElementSize = DstSize; + } + + // AVX version only operates on 128-bit. + const uint8_t NumElements = std::min(GetSrcSize(Op), 16) / OverridenElementSize; + Index &= NumElements - 1; // Wrap the immediate so it always selects a valid element. + + if (Op->Dest.IsGPR()) { + const uint8_t GPRSize = CTX->GetGPRSize(); + // Extract already zero extends the result.
+ Ref Result = _VExtractToGPR(OpSize::i128Bit, OverridenElementSize, Src.Low, Index); + StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, -1); + return; + } + + // If we are storing to memory then we store only the extracted element (OverridenElementSize bytes). + Ref Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); + _VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src.Low, Index, Dest); +} + } // namespace FEXCore::IR From 28d679de98e67944da0200c1c712b72233621ac0 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 01:03:33 -0700 Subject: [PATCH 09/13] AVX128: Implement support for vpmov{s,z}{b,w,d}{w,d,q} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 + .../Core/OpcodeDispatcher/AVX_128.cpp | 77 ++++++++++++++++--- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 83cd3ac109..40ac2241c8 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1041,6 +1041,8 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_MOVBetweenGPR_FPR(OpcodeArgs); template void AVX128_PExtr(OpcodeArgs); + template + void AVX128_ExtendVectorElements(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 5303875467..b37ac17dca 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -277,12 +277,12 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(2, 0b01, 0x1D), 1, &OpDispatchBuilder::AVX128_VectorUnary}, {OPD(2, 0b01, 0x1E), 1, &OpDispatchBuilder::AVX128_VectorUnary}, - // TODO: {OPD(2, 0b01, 0x20), 1, &OpDispatchBuilder::ExtendVectorElements<1, 2, true>}, - // TODO: {OPD(2, 0b01, 0x21), 1, 
&OpDispatchBuilder::ExtendVectorElements<1, 4, true>}, - // TODO: {OPD(2, 0b01, 0x22), 1, &OpDispatchBuilder::ExtendVectorElements<1, 8, true>}, - // TODO: {OPD(2, 0b01, 0x23), 1, &OpDispatchBuilder::ExtendVectorElements<2, 4, true>}, - // TODO: {OPD(2, 0b01, 0x24), 1, &OpDispatchBuilder::ExtendVectorElements<2, 8, true>}, - // TODO: {OPD(2, 0b01, 0x25), 1, &OpDispatchBuilder::ExtendVectorElements<4, 8, true>}, + {OPD(2, 0b01, 0x20), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 2, true>}, + {OPD(2, 0b01, 0x21), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 4, true>}, + {OPD(2, 0b01, 0x22), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 8, true>}, + {OPD(2, 0b01, 0x23), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<2, 4, true>}, + {OPD(2, 0b01, 0x24), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<2, 8, true>}, + {OPD(2, 0b01, 0x25), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<4, 8, true>}, // TODO: {OPD(2, 0b01, 0x28), 1, &OpDispatchBuilder::VPMULLOp<4, true>}, {OPD(2, 0b01, 0x29), 1, &OpDispatchBuilder::AVX128_VectorALU}, @@ -293,12 +293,12 @@ void OpDispatchBuilder::InstallAVX128Handlers() { // TODO: {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::VMASKMOVOp<4, true>}, // TODO: {OPD(2, 0b01, 0x2F), 1, &OpDispatchBuilder::VMASKMOVOp<8, true>}, - // TODO: {OPD(2, 0b01, 0x30), 1, &OpDispatchBuilder::ExtendVectorElements<1, 2, false>}, - // TODO: {OPD(2, 0b01, 0x31), 1, &OpDispatchBuilder::ExtendVectorElements<1, 4, false>}, - // TODO: {OPD(2, 0b01, 0x32), 1, &OpDispatchBuilder::ExtendVectorElements<1, 8, false>}, - // TODO: {OPD(2, 0b01, 0x33), 1, &OpDispatchBuilder::ExtendVectorElements<2, 4, false>}, - // TODO: {OPD(2, 0b01, 0x34), 1, &OpDispatchBuilder::ExtendVectorElements<2, 8, false>}, - // TODO: {OPD(2, 0b01, 0x35), 1, &OpDispatchBuilder::ExtendVectorElements<4, 8, false>}, + {OPD(2, 0b01, 0x30), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 2, false>}, + {OPD(2, 0b01, 0x31), 1, 
&OpDispatchBuilder::AVX128_ExtendVectorElements<1, 4, false>}, + {OPD(2, 0b01, 0x32), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<1, 8, false>}, + {OPD(2, 0b01, 0x33), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<2, 4, false>}, + {OPD(2, 0b01, 0x34), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<2, 8, false>}, + {OPD(2, 0b01, 0x35), 1, &OpDispatchBuilder::AVX128_ExtendVectorElements<4, 8, false>}, // TODO: {OPD(2, 0b01, 0x36), 1, &OpDispatchBuilder::VPERMDOp}, {OPD(2, 0b01, 0x37), 1, &OpDispatchBuilder::AVX128_VectorALU}, @@ -1041,4 +1041,57 @@ void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs) { _VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src.Low, Index, Dest); } +template +void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs) { // Widens elements from ElementSize to DstElementSize bytes, using _VSXTL when Signed and _VUXTL otherwise. + const auto DstSize = GetDstSize(Op); + + const auto GetSrc = [&] { + if (Op->Src[0].IsGPR()) { + return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low; + } else { + // For memory operands the 256-bit variant loads twice the size specified in the table. + const auto Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto SrcSize = GetSrcSize(Op); + const auto LoadSize = Is256Bit ? SrcSize * 2 : SrcSize; + + return LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], LoadSize, Op->Flags); + } + }; + + auto Transform = [this](Ref Src) { + for (size_t CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize <<= 1) { // Each step doubles the element width until DstElementSize is reached. + if (Signed) { + Src = _VSXTL(OpSize::i128Bit, CurrentElementSize, Src); + } else { + Src = _VUXTL(OpSize::i128Bit, CurrentElementSize, Src); + } + } + return Src; + }; + + Ref Src = GetSrc(); + RefPair Result {}; + + if (DstSize == OpSize::i128Bit) { + // 128-bit operation is easy, it stays within the single register. + Result.Low = Transform(Src); + } else { + // 256-bit operation is a bit special. It splits the incoming source between lower and upper registers.
+ size_t TotalElementCount = OpSize::i256Bit / DstElementSize; + size_t TotalElementsToSplitSize = (TotalElementCount / 2) * ElementSize; + + // Split the number of elements in half between lower and upper: element 1 (of TotalElementsToSplitSize bytes) of Src feeds the upper half. + Ref SrcHigh = _VDupElement(OpSize::i128Bit, TotalElementsToSplitSize, Src, 1); + Result.Low = Transform(Src); + Result.High = Transform(SrcHigh); + } + + if (DstSize == OpSize::i128Bit) { + // Regular zero-extending semantics: the upper 128 bits of the result are cleared. + Result.High = LoadZeroVector(OpSize::i128Bit); + } + + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + } // namespace FEXCore::IR From b58a57d2250962004cb90fa36240bb49bcd652f0 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 02:49:40 -0700 Subject: [PATCH 10/13] AVX128: Implement support for vmovmskp{s,d} --- .../Source/Interface/Core/OpcodeDispatcher.h | 2 + .../Core/OpcodeDispatcher/AVX_128.cpp | 55 ++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 40ac2241c8..a8ed294a00 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1043,6 +1043,8 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_PExtr(OpcodeArgs); template void AVX128_ExtendVectorElements(OpcodeArgs); + template + void AVX128_MOVMSK(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index b37ac17dca..625a0aa5b5 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -74,8 +74,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<4>}, {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::AVX128_UCOMISx<8>}, - // TODO: {OPD(1, 0b00, 0x50), 1, 
&OpDispatchBuilder::MOVMSKOp<4>}, - // TODO: {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::MOVMSKOp<8>}, + {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<4>}, + {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::AVX128_MOVMSK<8>}, {OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::AVX128_VectorUnary}, {OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::AVX128_VectorUnary}, @@ -1094,4 +1094,55 @@ void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } +template +void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs) { // Packs the sign bit of every ElementSize-byte element of Src[0] into the low bits of the destination GPR. + const auto SrcSize = GetSrcSize(Op); + const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); + + auto Mask8Byte = [this](Ref Src) { + // UnZip2 the 64-bit elements as 32-bit to get the sign bits closer. + // Sign bits are now in bit positions 31 and 63 after this. + Src = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src); + + // Extract the low 64-bits to GPR in one move. + Ref GPR = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 0); + // BFI the sign bit in 31 in to 62. + // Inserting the full lower 32-bits at offset 31 so the sign bit ends up at offset 62, directly below the other sign bit at 63. + GPR = _Bfi(OpSize::i64Bit, 32, 31, GPR, GPR); + // Shift right to only get the two sign bits we care about. + return _Lshr(OpSize::i64Bit, GPR, _Constant(62)); + }; + + auto Mask4Byte = [this](Ref Src) { + // Shift all the sign bits to the bottom of their respective elements. + Src = _VUShrI(OpSize::i128Bit, OpSize::i32Bit, Src, 31); + // Load the specific 128-bit movmskps shift elements operator. + auto ConstantUSHL = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_MOVMSKPS_SHIFT); + // Shift the sign bits in to specific locations.
+ Src = _VUShl(OpSize::i128Bit, OpSize::i32Bit, Src, ConstantUSHL, false); + // Add across the vector so the sign bits will end up in bits [3:0] + Src = _VAddV(OpSize::i128Bit, OpSize::i32Bit, Src); + // Extract to a GPR. + return _VExtractToGPR(OpSize::i128Bit, OpSize::i32Bit, Src, 0); + }; + + Ref GPR {}; + if (SrcSize == 16 && ElementSize == 8) { + GPR = Mask8Byte(Src.Low); + } else if (SrcSize == 16 && ElementSize == 4) { + GPR = Mask4Byte(Src.Low); + } else if (ElementSize == 4) { + auto GPRLow = Mask4Byte(Src.Low); + auto GPRHigh = Mask4Byte(Src.High); + GPR = _Orlshl(OpSize::i64Bit, GPRLow, GPRHigh, 4); // NOTE(review): presumably GPRLow | (GPRHigh << 4), i.e. four mask bits per 128-bit half - confirm _Orlshl semantics. + } else { + auto GPRLow = Mask8Byte(Src.Low); + auto GPRHigh = Mask8Byte(Src.High); + GPR = _Orlshl(OpSize::i64Bit, GPRLow, GPRHigh, 2); // NOTE(review): presumably GPRLow | (GPRHigh << 2), i.e. two mask bits per 128-bit half - confirm _Orlshl semantics. + } + StoreResult_WithOpSize(GPRClass, Op, Op->Dest, GPR, CTX->GetGPRSize(), -1); +} + } // namespace FEXCore::IR From ad122cf463a71d9f859bf894daabc67e63447fd1 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 02:58:55 -0700 Subject: [PATCH 11/13] AVX128: Implement support for vpmovmskb --- .../Source/Interface/Core/OpcodeDispatcher.h | 1 + .../Core/OpcodeDispatcher/AVX_128.cpp | 31 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index a8ed294a00..bfb7d87f2a 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1045,6 +1045,7 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_ExtendVectorElements(OpcodeArgs); template void AVX128_MOVMSK(OpcodeArgs); + void AVX128_MOVMSKB(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 625a0aa5b5..083acb41dd 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ 
b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -199,7 +199,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0xD5), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::AVX128_MOVQ}, - // TODO: {OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::MOVMSKOpOne}, + {OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::AVX128_MOVMSKB}, {OPD(1, 0b01, 0xD8), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0xD9), 1, &OpDispatchBuilder::AVX128_VectorALU}, @@ -1145,4 +1145,33 @@ void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs) { StoreResult_WithOpSize(GPRClass, Op, Op->Dest, GPR, CTX->GetGPRSize(), -1); } +void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) { // Builds a per-byte mask in a GPR: 16 bits from the low 128-bit half, plus 16 more from the high half for wider sources. + const auto SrcSize = GetSrcSize(Op); + const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; + + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); + Ref VMask = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_MOVMASKB); + + auto Mask1Byte = [this](Ref Src, Ref VMask) { + auto VCMP = _VCMPLTZ(OpSize::i128Bit, OpSize::i8Bit, Src); // NOTE(review): presumably flags every byte that is less than zero (sign bit set) - confirm _VCMPLTZ semantics. + auto VAnd = _VAnd(OpSize::i128Bit, OpSize::i8Bit, VCMP, VMask); + + auto VAdd1 = _VAddP(OpSize::i128Bit, OpSize::i8Bit, VAnd, VAnd); + auto VAdd2 = _VAddP(OpSize::i128Bit, OpSize::i8Bit, VAdd1, VAdd1); + auto VAdd3 = _VAddP(OpSize::i64Bit, OpSize::i8Bit, VAdd2, VAdd2); + + ///< 16-bits of data per 128-bit, so a 2-byte element is extracted. + return _VExtractToGPR(OpSize::i128Bit, 2, VAdd3, 0); + }; + + Ref Result = Mask1Byte(Src.Low, VMask); + + if (!Is128Bit) { + auto ResultHigh = Mask1Byte(Src.High, VMask); + Result = _Orlshl(OpSize::i64Bit, Result, ResultHigh, 16); // NOTE(review): presumably Result | (ResultHigh << 16), placing the high half's bits above the low half's - confirm _Orlshl semantics. + } + + StoreResult(GPRClass, Op, Result, -1); +} + } // namespace FEXCore::IR From abdcaa7c86e6d3938a0ece29fceef69d3f42c983 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 03:09:54 -0700 Subject: [PATCH 12/13] AVX128: Implement support for vpinsr{b,w,d,q} --- 
.../Source/Interface/Core/OpcodeDispatcher.h | 5 +++ .../Core/OpcodeDispatcher/AVX_128.cpp | 41 +++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index bfb7d87f2a..952902b197 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -1046,6 +1046,11 @@ class OpDispatchBuilder final : public IREmitter { template void AVX128_MOVMSK(OpcodeArgs); void AVX128_MOVMSKB(OpcodeArgs); + void AVX128_PINSRImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, + const X86Tables::DecodedOperand& Imm); + void AVX128_VPINSRB(OpcodeArgs); + void AVX128_VPINSRW(OpcodeArgs); + void AVX128_VPINSRDQ(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 083acb41dd..ab246e289f 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -184,7 +184,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<4>}, {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<8>}, - // TODO: {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::VPINSRWOp}, + {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::AVX128_VPINSRW}, {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::AVX128_PExtr<2>}, // TODO: {OPD(1, 0b00, 0xC6), 1, &OpDispatchBuilder::VSHUFOp<4>}, @@ -355,9 +355,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() { // TODO: {OPD(3, 0b01, 0x18), 1, &OpDispatchBuilder::VINSERTOp}, // TODO: {OPD(3, 0b01, 0x19), 1, &OpDispatchBuilder::VEXTRACT128Op}, - // TODO: {OPD(3, 0b01, 0x20), 1, &OpDispatchBuilder::VPINSRBOp}, + {OPD(3, 0b01, 0x20), 1, 
&OpDispatchBuilder::AVX128_VPINSRB}, // TODO: {OPD(3, 0b01, 0x21), 1, &OpDispatchBuilder::VINSERTPSOp}, - // TODO: {OPD(3, 0b01, 0x22), 1, &OpDispatchBuilder::VPINSRDQOp}, + {OPD(3, 0b01, 0x22), 1, &OpDispatchBuilder::AVX128_VPINSRDQ}, // TODO: {OPD(3, 0b01, 0x38), 1, &OpDispatchBuilder::VINSERTOp}, // TODO: {OPD(3, 0b01, 0x39), 1, &OpDispatchBuilder::VEXTRACT128Op}, @@ -1174,4 +1174,39 @@ void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) { StoreResult(GPRClass, Op, Result, -1); } +void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, + const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) { // Inserts one ElementSize-byte value (from a GPR or memory) into element Index of Src1's low 128 bits. + const auto NumElements = OpSize::i128Bit / ElementSize; + const uint64_t Index = Imm.Literal() & (NumElements - 1); // Wrap the immediate so it always selects a valid element. + auto Src1 = AVX128_LoadSource_WithOpSize(Op, Src1Op, Op->Flags, false); + + RefPair Result {}; + + if (Src2Op.IsGPR()) { + // If the source is a GPR then convert directly from the GPR. + auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, CTX->GetGPRSize(), Op->Flags); + Result.Low = _VInsGPR(OpSize::i128Bit, ElementSize, Index, Src1.Low, Src2); + } else { + // If loading from memory then we only load the element size; with LoadData = false, Src2 is presumably the address rather than the loaded value. + auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Src2Op, ElementSize, Op->Flags, {.LoadData = false}); + Result.Low = _VLoadVectorElement(OpSize::i128Bit, ElementSize, Src1.Low, Index, Src2); + } + + Result.High = LoadZeroVector(OpSize::i128Bit); // The upper 128 bits of the destination are always zeroed. + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + +void OpDispatchBuilder::AVX128_VPINSRB(OpcodeArgs) { + AVX128_PINSRImpl(Op, 1, Op->Src[0], Op->Src[1], Op->Src[2]); // 1-byte elements. +} + +void OpDispatchBuilder::AVX128_VPINSRW(OpcodeArgs) { + AVX128_PINSRImpl(Op, 2, Op->Src[0], Op->Src[1], Op->Src[2]); // 2-byte elements. +} + +void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) { + const auto SrcSize = GetSrcSize(Op); // The element size follows the decoded source size. + AVX128_PINSRImpl(Op, SrcSize, Op->Src[0], Op->Src[1], Op->Src[2]); +} + } // namespace FEXCore::IR From 
9c531d97b07902a29d627cbb84ac489c3b6be4ba Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 18 Jun 2024 04:15:23 -0700 Subject: [PATCH 13/13] AVX128: Implements the various vector shift instructions These are very closely related to each other so it makes sense to implement the roughly three different families in one commit. --- .../Source/Interface/Core/OpcodeDispatcher.h | 26 +++ .../Core/OpcodeDispatcher/AVX_128.cpp | 183 ++++++++++++++++-- 2 files changed, 188 insertions(+), 21 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 952902b197..79843b77dd 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -992,6 +992,11 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_VectorUnaryImpl(OpcodeArgs, IROps IROp, size_t ElementSize); void AVX128_VectorUnaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, std::function Helper); void AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, size_t ElementSize, std::function Helper); + void AVX128_VectorShiftWideImpl(OpcodeArgs, size_t ElementSize, IROps IROp); + void AVX128_VectorShiftImmImpl(OpcodeArgs, size_t ElementSize, IROps IROp); + + enum class ShiftDirection { RIGHT, LEFT }; + void AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir); void AVX128_VMOVAPS(OpcodeArgs); void AVX128_VMOVSD(OpcodeArgs); @@ -1051,6 +1056,27 @@ class OpDispatchBuilder final : public IREmitter { void AVX128_VPINSRB(OpcodeArgs); void AVX128_VPINSRW(OpcodeArgs); void AVX128_VPINSRDQ(OpcodeArgs); + template + void AVX128_VPSRA(OpcodeArgs); + template + void AVX128_VPSLL(OpcodeArgs); + template + void AVX128_VPSRL(OpcodeArgs); + + void AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp); + void AVX128_VPSLLV(OpcodeArgs); + void AVX128_VPSRAVD(OpcodeArgs); + void AVX128_VPSRLV(OpcodeArgs); + + template + void AVX128_VPSRLI(OpcodeArgs); + template + void AVX128_VPSLLI(OpcodeArgs); + 
template + void AVX128_VPSRAI(OpcodeArgs); + + void AVX128_VPSRLDQ(OpcodeArgs); + void AVX128_VPSLLDQ(OpcodeArgs); // End of AVX 128-bit implementation diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index ab246e289f..760c743028 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -193,9 +193,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() { // TODO: {OPD(1, 0b01, 0xD0), 1, &OpDispatchBuilder::VADDSUBPOp<8>}, // TODO: {OPD(1, 0b11, 0xD0), 1, &OpDispatchBuilder::VADDSUBPOp<4>}, - // TODO: {OPD(1, 0b01, 0xD1), 1, &OpDispatchBuilder::VPSRLDOp<2>}, - // TODO: {OPD(1, 0b01, 0xD2), 1, &OpDispatchBuilder::VPSRLDOp<4>}, - // TODO: {OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::VPSRLDOp<8>}, + {OPD(1, 0b01, 0xD1), 1, &OpDispatchBuilder::AVX128_VPSRL<2>}, + {OPD(1, 0b01, 0xD2), 1, &OpDispatchBuilder::AVX128_VPSRL<4>}, + {OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::AVX128_VPSRL<8>}, {OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0xD5), 1, &OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::AVX128_MOVQ}, @@ -211,8 +211,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0xDF), 1, &OpDispatchBuilder::AVX128_VANDN}, {OPD(1, 0b01, 0xE0), 1, &OpDispatchBuilder::AVX128_VectorALU}, - // TODO: {OPD(1, 0b01, 0xE1), 1, &OpDispatchBuilder::VPSRAOp<2>}, - // TODO: {OPD(1, 0b01, 0xE2), 1, &OpDispatchBuilder::VPSRAOp<4>}, + {OPD(1, 0b01, 0xE1), 1, &OpDispatchBuilder::AVX128_VPSRA<2>}, + {OPD(1, 0b01, 0xE2), 1, &OpDispatchBuilder::AVX128_VPSRA<4>}, {OPD(1, 0b01, 0xE3), 1, &OpDispatchBuilder::AVX128_VectorALU}, // TODO: {OPD(1, 0b01, 0xE4), 1, &OpDispatchBuilder::VPMULHWOp}, // TODO: {OPD(1, 0b01, 0xE5), 1, &OpDispatchBuilder::VPMULHWOp}, @@ -233,9 +233,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0xEF), 1, 
&OpDispatchBuilder::AVX128_VectorALU}, {OPD(1, 0b11, 0xF0), 1, &OpDispatchBuilder::AVX128_MOVVectorUnaligned}, - // TODO: {OPD(1, 0b01, 0xF1), 1, &OpDispatchBuilder::VPSLLOp<2>}, - // TODO: {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::VPSLLOp<4>}, - // TODO: {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::VPSLLOp<8>}, + {OPD(1, 0b01, 0xF1), 1, &OpDispatchBuilder::AVX128_VPSLL<2>}, + {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::AVX128_VPSLL<4>}, + {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::AVX128_VPSLL<8>}, // TODO: {OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::VPMULLOp<4, false>}, // TODO: {OPD(1, 0b01, 0xF5), 1, &OpDispatchBuilder::VPMADDWDOp}, // TODO: {OPD(1, 0b01, 0xF6), 1, &OpDispatchBuilder::VPSADBWOp}, @@ -313,9 +313,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(2, 0b01, 0x40), 1, &OpDispatchBuilder::AVX128_VectorALU}, // TODO: {OPD(2, 0b01, 0x41), 1, &OpDispatchBuilder::PHMINPOSUWOp}, - // TODO: {OPD(2, 0b01, 0x45), 1, &OpDispatchBuilder::VPSRLVOp}, - // TODO: {OPD(2, 0b01, 0x46), 1, &OpDispatchBuilder::VPSRAVDOp}, - // TODO: {OPD(2, 0b01, 0x47), 1, &OpDispatchBuilder::VPSLLVOp}, + {OPD(2, 0b01, 0x45), 1, &OpDispatchBuilder::AVX128_VPSRLV}, + {OPD(2, 0b01, 0x46), 1, &OpDispatchBuilder::AVX128_VPSRAVD}, + {OPD(2, 0b01, 0x47), 1, &OpDispatchBuilder::AVX128_VPSLLV}, {OPD(2, 0b01, 0x58), 1, &OpDispatchBuilder::AVX128_VBROADCAST<4>}, {OPD(2, 0b01, 0x59), 1, &OpDispatchBuilder::AVX128_VBROADCAST<8>}, @@ -383,18 +383,18 @@ void OpDispatchBuilder::InstallAVX128Handlers() { #define OPD(group, pp, opcode) (((group - X86Tables::TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode)) static constexpr std::tuple VEX128TableGroupOps[] { - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b010), 1, &OpDispatchBuilder::VPSRLIOp<2>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b110), 1, &OpDispatchBuilder::VPSLLIOp<2>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b100), 1, &OpDispatchBuilder::VPSRAIOp<2>}, + {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b010), 1, 
&OpDispatchBuilder::AVX128_VPSRLI<2>}, + {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b110), 1, &OpDispatchBuilder::AVX128_VPSLLI<2>}, + {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b100), 1, &OpDispatchBuilder::AVX128_VPSRAI<2>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b010), 1, &OpDispatchBuilder::VPSRLIOp<4>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b110), 1, &OpDispatchBuilder::VPSLLIOp<4>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b100), 1, &OpDispatchBuilder::VPSRAIOp<4>}, + {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b010), 1, &OpDispatchBuilder::AVX128_VPSRLI<4>}, + {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b110), 1, &OpDispatchBuilder::AVX128_VPSLLI<4>}, + {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b100), 1, &OpDispatchBuilder::AVX128_VPSRAI<4>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b010), 1, &OpDispatchBuilder::VPSRLIOp<8>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b011), 1, &OpDispatchBuilder::VPSRLDQOp}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b110), 1, &OpDispatchBuilder::VPSLLIOp<8>}, - // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b111), 1, &OpDispatchBuilder::VPSLLDQOp}, + {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b010), 1, &OpDispatchBuilder::AVX128_VPSRLI<8>}, + {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b011), 1, &OpDispatchBuilder::AVX128_VPSRLDQ}, + {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b110), 1, &OpDispatchBuilder::AVX128_VPSLLI<8>}, + {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b111), 1, &OpDispatchBuilder::AVX128_VPSLLDQ}, // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b010), 1, &OpDispatchBuilder::LDMXCSR}, // TODO: {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b011), 1, &OpDispatchBuilder::STMXCSR}, @@ -613,6 +613,55 @@ void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, size_t SrcSize, size AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } +void OpDispatchBuilder::AVX128_VectorShiftWideImpl(OpcodeArgs, size_t ElementSize, IROps IROp) { + const auto Is128Bit = GetSrcSize(Op) == 
Core::CPUState::XMM_SSE_REG_SIZE; + + auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); + auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); + + // Incoming element size for the shift source is always 8-bytes in the lower register. + DeriveOp(Low, IROp, _VUShrSWide(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low)); + + RefPair Result {}; + Result.Low = Low; + + if (Is128Bit) { + Result.High = LoadZeroVector(OpSize::i128Bit); + } else { + DeriveOp(High, IROp, _VUShrSWide(OpSize::i128Bit, ElementSize, Src1.High, Src2.Low)); // Both lanes use the same count from Src2.Low; Src2's high half was never requested above. + Result.High = High; + } + + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + +void OpDispatchBuilder::AVX128_VectorShiftImmImpl(OpcodeArgs, size_t ElementSize, IROps IROp) { + const auto DstSize = GetDstSize(Op); + const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const uint64_t ShiftConstant = Op->Src[1].Literal(); + + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); + RefPair Result {}; + + if (ShiftConstant == 0) [[unlikely]] { + Result = Src; + } else { + DeriveOp(Low, IROp, _VUShrI(OpSize::i128Bit, ElementSize, Src.Low, ShiftConstant)); + Result.Low = Low; + + if (!Is128Bit) { + DeriveOp(High, IROp, _VUShrI(OpSize::i128Bit, ElementSize, Src.High, ShiftConstant)); // The upper lane shifts Src.High; it must not repeat the lower lane. + Result.High = High; + } + } + + if (Is128Bit) { + Result.High = LoadZeroVector(OpSize::i128Bit); + } + + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + template void OpDispatchBuilder::AVX128_VectorALU(OpcodeArgs) { AVX128_VectorALUImpl(Op, IROp, ElementSize); @@ -1209,4 +1258,96 @@ void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) { AVX128_PINSRImpl(Op, SrcSize, Op->Src[0], Op->Src[1], Op->Src[2]); } +template +void OpDispatchBuilder::AVX128_VPSRA(OpcodeArgs) { + AVX128_VectorShiftWideImpl(Op, ElementSize, IROps::OP_VSSHRSWIDE); +} + +template +void OpDispatchBuilder::AVX128_VPSLL(OpcodeArgs) { + AVX128_VectorShiftWideImpl(Op, ElementSize, 
IROps::OP_VUSHLSWIDE); +} + +template +void OpDispatchBuilder::AVX128_VPSRL(OpcodeArgs) { + AVX128_VectorShiftWideImpl(Op, ElementSize, IROps::OP_VUSHRSWIDE); +} + +void OpDispatchBuilder::AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp) { + AVX128_VectorBinaryImpl(Op, GetDstSize(Op), GetSrcSize(Op), [this, IROp](size_t ElementSize, Ref Src1, Ref Src2) { + DeriveOp(Shift, IROp, _VUShr(OpSize::i128Bit, ElementSize, Src1, Src2, true)); + return Shift; + }); +} + +void OpDispatchBuilder::AVX128_VPSLLV(OpcodeArgs) { + AVX128_VariableShiftImpl(Op, IROps::OP_VUSHL); +} + +void OpDispatchBuilder::AVX128_VPSRAVD(OpcodeArgs) { + AVX128_VariableShiftImpl(Op, IROps::OP_VSSHR); +} + +void OpDispatchBuilder::AVX128_VPSRLV(OpcodeArgs) { + AVX128_VariableShiftImpl(Op, IROps::OP_VUSHR); +} + +template +void OpDispatchBuilder::AVX128_VPSRLI(OpcodeArgs) { + AVX128_VectorShiftImmImpl(Op, ElementSize, IROps::OP_VUSHRI); +} + +template +void OpDispatchBuilder::AVX128_VPSLLI(OpcodeArgs) { + AVX128_VectorShiftImmImpl(Op, ElementSize, IROps::OP_VSHLI); +} + +template +void OpDispatchBuilder::AVX128_VPSRAI(OpcodeArgs) { + AVX128_VectorShiftImmImpl(Op, ElementSize, IROps::OP_VSSHRI); +} + +void OpDispatchBuilder::AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir) { + const auto DstSize = GetDstSize(Op); + const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; + const bool Right = Dir == ShiftDirection::RIGHT; + + const uint64_t Shift = Op->Src[1].Literal(); + const uint64_t ExtrShift = Right ? Shift : OpSize::i128Bit - Shift; + + auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); + + RefPair Result {}; + if (Shift == 0) [[unlikely]] { + Result = Src; + } else if (Shift >= Core::CPUState::XMM_SSE_REG_SIZE) { + Result.Low = LoadZeroVector(OpSize::i128Bit); + Result.High = Result.Low; // Every lane is shifted out completely, so the upper lane is zero as well. + } else { + Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); + RefPair Zero {ZeroVector, ZeroVector}; + RefPair Src1 = Right ? Zero : Src; + RefPair Src2 = Right ? 
Src : Zero; + + Result.Low = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1.Low, Src2.Low, ExtrShift); + if (!Is128Bit) { + Result.High = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1.High, Src2.High, ExtrShift); + } + } + + if (Is128Bit) { + Result.High = LoadZeroVector(OpSize::i128Bit); + } + + AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); +} + +void OpDispatchBuilder::AVX128_VPSRLDQ(OpcodeArgs) { + AVX128_ShiftDoubleImm(Op, ShiftDirection::RIGHT); +} + +void OpDispatchBuilder::AVX128_VPSLLDQ(OpcodeArgs) { + AVX128_ShiftDoubleImm(Op, ShiftDirection::LEFT); +} + } // namespace FEXCore::IR