From 20663ee6b6c3aad2ce60a7f3cc616692df07c93b Mon Sep 17 00:00:00 2001
From: Ryan Houdek <Sonicadvance1@gmail.com>
Date: Mon, 17 Jun 2024 23:05:30 -0700
Subject: [PATCH] AVX128: Implement support for vcmps{s,d}

---
 .../Source/Interface/Core/OpcodeDispatcher.h  |  3 +
 .../Core/OpcodeDispatcher/AVX_128.cpp         | 84 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
index 934b5c38cd..e9fb340bfa 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -1032,6 +1032,9 @@ class OpDispatchBuilder final : public IREmitter {
   Ref AVX128_VFCMPImpl(size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType);
   template<size_t ElementSize>
   void AVX128_VFCMP(OpcodeArgs);
+  Ref AVX128_InsertScalarFCMPImpl(size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType);
+  template<size_t ElementSize>
+  void AVX128_InsertScalarFCMP(OpcodeArgs);
 
   // End of AVX 128-bit implementation
 
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
index 70d0cdedfa..a07bdeb21d 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@@ -181,8 +181,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
 
     {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<4>},
     {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVX128_VFCMP<8>},
-    // TODO: {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<4>},
-    // TODO: {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp<8>},
+    {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<4>},
+    {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVX128_InsertScalarFCMP<8>},
 
     // TODO: {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::VPINSRWOp},
     // TODO: {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::PExtrOp<2>},
@@ -1266,4 +1266,84 @@ void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs) {
   AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
 }
 
+Ref OpDispatchBuilder::AVX128_InsertScalarFCMPImpl(size_t ElementSize, Ref Src1, Ref Src2, uint8_t CompType) {
+  switch (CompType) {
+  case 0x00:
+  case 0x08:
+  case 0x10:
+  case 0x18: // EQ
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::EQ, false);
+  case 0x01:
+  case 0x09:
+  case 0x11:
+  case 0x19: // LT, GT(Swapped operand)
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::LT, false);
+  case 0x02:
+  case 0x0A:
+  case 0x12:
+  case 0x1A: // LE, GE(Swapped operand)
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::LE, false);
+  case 0x03:
+  case 0x0B:
+  case 0x13:
+  case 0x1B: // Unordered
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::UNO, false);
+  case 0x04:
+  case 0x0C:
+  case 0x14:
+  case 0x1C: // NEQ
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::NEQ, false);
+  case 0x05:
+  case 0x0D:
+  case 0x15:
+  case 0x1D: { // NLT, NGT(Swapped operand)
+    Ref Result = _VFCMPLT(ElementSize, ElementSize, Src1, Src2);
+    Result = _VNot(ElementSize, ElementSize, Result);
+    // Insert the lower bits
+    return _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Src1, Result);
+  }
+  case 0x06:
+  case 0x0E:
+  case 0x16:
+  case 0x1E: { // NLE, NGE(Swapped operand)
+    Ref Result = _VFCMPLE(ElementSize, ElementSize, Src1, Src2);
+    Result = _VNot(ElementSize, ElementSize, Result);
+    // Insert the lower bits
+    return _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Src1, Result);
+  }
+  case 0x07:
+  case 0x0F:
+  case 0x17:
+  case 0x1F: // Ordered
+    return _VFCMPScalarInsert(OpSize::i128Bit, ElementSize, Src1, Src2, FloatCompareOp::ORD, false);
+  default: LOGMAN_MSG_A_FMT("Unknown Comparison type: {}", CompType); break;
+  }
+  FEX_UNREACHABLE;
+}
+
+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_InsertScalarFCMP(OpcodeArgs) {
+  // We load the full vector width when dealing with a source vector,
+  // so that we don't do any unnecessary zero extension to the scalar
+  // element that we're going to operate on.
+  const auto SrcSize = GetSrcSize(Op);
+
+  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
+  RefPair Src2 {};
+
+  if (Op->Src[1].IsGPR()) {
+    Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false);
+  } else {
+    Src2.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags);
+  }
+
+  LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "Src[2] needs to be literal");
+  const uint8_t CompType = Op->Src[2].Data.Literal.Value;
+
+  RefPair Result {};
+  Result.Low = AVX128_InsertScalarFCMPImpl(ElementSize, Src1.Low, Src2.Low, CompType);
+  Result.High = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO);
+  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
+}
+
 } // namespace FEXCore::IR