Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize DF representation #3469

Merged
merged 8 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ namespace FEXCore::Context {
case X86State::RFLAG_ZF_RAW_LOC:
case X86State::RFLAG_SF_RAW_LOC:
case X86State::RFLAG_OF_RAW_LOC:
case X86State::RFLAG_DF_RAW_LOC:
// Intentionally do nothing.
// These contain multiple bits which can corrupt other members when compacted.
break;
Expand Down Expand Up @@ -215,6 +216,11 @@ namespace FEXCore::Context {
uint32_t AF = ((Frame->State.af_raw ^ PFByte) & (1 << 4)) ? 1 : 0;
EFLAGS |= AF << X86State::RFLAG_AF_RAW_LOC;

// DF is pretransformed, undo the transform from 1/-1 back to 0/1
uint8_t DFByte = Frame->State.flags[X86State::RFLAG_DF_RAW_LOC];
if (DFByte & 0x80)
EFLAGS |= 1 << X86State::RFLAG_DF_RAW_LOC;

return EFLAGS;
}

Expand All @@ -238,6 +244,10 @@ namespace FEXCore::Context {
// PF is inverted in our internal representation.
Frame->State.pf_raw = (EFLAGS & (1U << i)) ? 0 : 1;
break;
case X86State::RFLAG_DF_RAW_LOC:
// DF is encoded as 1/-1
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 0xff : 1;
break;
default:
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 1 : 0;
break;
Expand Down
24 changes: 16 additions & 8 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,14 @@ DEF_OP(StoreNZCV) {
msr(ARMEmitter::SystemRegister::NZCV, GetReg(Op->Value.ID()));
}

DEF_OP(LoadDF) {
  // The direction flag is kept in the CPU state as a single byte holding
  // either 0x01 or 0xFF. A sign-extending byte load widens that into the
  // 64-bit destination as 1 or -1 respectively, with no extra instruction.
  auto Result = GetReg(Node);

  ldrsb(Result.X(), STATE, offsetof(FEXCore::Core::CPUState, flags[X86State::RFLAG_DF_RAW_LOC]));
}

DEF_OP(LoadFlag) {
auto Op = IROp->C<IR::IROp_LoadFlag>();
auto Dst = GetReg(Node);
Expand Down Expand Up @@ -1708,7 +1716,7 @@ DEF_OP(MemSet) {
DirectionReg = GetReg(Op->Direction.ID());
}

// If Direction == 0 then:
// If Direction > 0 then:
// MemReg is incremented (by size)
// else:
// MemReg is decremented (by size)
Expand All @@ -1729,7 +1737,7 @@ DEF_OP(MemSet) {

if (!DirectionIsInline) {
// Backward or forwards implementation depends on flag
cbnz(ARMEmitter::Size::i64Bit, DirectionReg, &BackwardImpl);
tbnz(DirectionReg, 1, &BackwardImpl);
}

auto MemStore = [this](auto Value, uint32_t OpSize, int32_t Size) {
Expand Down Expand Up @@ -1847,8 +1855,8 @@ DEF_OP(MemSet) {
};

if (DirectionIsInline) {
// If the direction constant is set then the direction is negative.
EmitMemset(DirectionConstant ? -1 : 1);
LOGMAN_THROW_AA_FMT(DirectionConstant == 1 || DirectionConstant == -1, "unexpected direction");
EmitMemset(DirectionConstant);
}
else {
// Emit forward direction memset then backward direction memset.
Expand Down Expand Up @@ -1886,7 +1894,7 @@ DEF_OP(MemCpy) {
}

auto Dst = GetRegPair(Node);
// If Direction == 0 then:
// If Direction > 0 then:
// MemRegDest is incremented (by size)
// MemRegSrc is incremented (by size)
// else:
Expand Down Expand Up @@ -1922,7 +1930,7 @@ DEF_OP(MemCpy) {

if (!DirectionIsInline) {
// Backward or forwards implementation depends on flag
cbnz(ARMEmitter::Size::i64Bit, DirectionReg, &BackwardImpl);
tbnz(DirectionReg, 1, &BackwardImpl);
}

auto MemCpy = [this](uint32_t OpSize, int32_t Size) {
Expand Down Expand Up @@ -2121,8 +2129,8 @@ DEF_OP(MemCpy) {
};

if (DirectionIsInline) {
// If the direction constant is set then the direction is negative.
EmitMemcpy(DirectionConstant ? -1 : 1);
LOGMAN_THROW_AA_FMT(DirectionConstant == 1 || DirectionConstant == -1, "unexpected direction");
EmitMemcpy(DirectionConstant);
}
else {
// Emit forward direction memcpy then backward direction memcpy.
Expand Down
61 changes: 15 additions & 46 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1379,10 +1379,10 @@ void OpDispatchBuilder::FLAGControlOp(OpcodeArgs) {
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_CF_RAW_LOC);
break;
case 0xFC: // CLD
SetRFLAG(_Constant(0), FEXCore::X86State::RFLAG_DF_LOC);
SetRFLAG(_Constant(0), FEXCore::X86State::RFLAG_DF_RAW_LOC);
break;
case 0xFD: // STD
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_DF_LOC);
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_DF_RAW_LOC);
break;
}
}
Expand Down Expand Up @@ -3627,7 +3627,8 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
const bool Repeat = (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) != 0;

if (!Repeat) {
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
// Src is used only for a store of the same size so allow garbage
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Dest = LoadGPRRegister(X86State::REG_RDI);

// Only ES prefix
Expand All @@ -3636,16 +3637,9 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
// Store to memory where RDI points
_StoreMemAutoTSO(GPRClass, Size, Dest, Src, Size);

// Calculate direction.
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest = LoadGPRRegister(X86State::REG_RDI);
TailDest = _Add(OpSize::i64Bit, TailDest, PtrDir);

StoreGPRRegister(X86State::REG_RDI, TailDest);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, Size));
}
else {
// FEX doesn't support partial faulting REP instructions.
Expand All @@ -3658,9 +3652,8 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
auto Segment = GetSegment(0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);

auto Result = _MemSet(CTX->IsAtomicTSOEnabled(), Size, Segment ?: InvalidNode, Dest, Src, Counter, DF);
auto Result = _MemSet(CTX->IsAtomicTSOEnabled(), Size, Segment ?: InvalidNode, Dest, Src, Counter, LoadDir(1));
StoreGPRRegister(X86State::REG_RCX, _Constant(0));
StoreGPRRegister(X86State::REG_RDI, Result);
}
Expand All @@ -3676,9 +3669,6 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
// RA now can handle these to be here, to avoid DF accesses
const auto Size = GetSrcSize(Op);

// Calculate direction.
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);

if (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) {
auto SrcAddr = LoadGPRRegister(X86State::REG_RSI);
auto DstAddr = LoadGPRRegister(X86State::REG_RDI);
Expand All @@ -3690,7 +3680,8 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
auto Result = _MemCpy(CTX->IsAtomicTSOEnabled(), Size,
DstSegment ?: InvalidNode,
SrcSegment ?: InvalidNode,
DstAddr, SrcAddr, Counter, DF);
DstAddr, SrcAddr, Counter,
LoadDir(1));

OrderedNode *Result_Dst = _ExtractElementPair(OpSize::i64Bit, Result, 0);
OrderedNode *Result_Src = _ExtractElementPair(OpSize::i64Bit, Result, 1);
Expand All @@ -3700,9 +3691,6 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RSI, Result_Src);
}
else {
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

OrderedNode *RSI = LoadGPRRegister(X86State::REG_RSI);
OrderedNode *RDI = LoadGPRRegister(X86State::REG_RDI);
RDI= AppendSegmentOffset(RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);
Expand All @@ -3713,6 +3701,7 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
// Store to memory where RDI points
_StoreMemAutoTSO(GPRClass, Size, RDI, Src, Size);

auto PtrDir = LoadDir(Size);
RSI = _Add(OpSize::i64Bit, RSI, PtrDir);
RDI = _Add(OpSize::i64Bit, RDI, PtrDir);

Expand Down Expand Up @@ -3745,9 +3734,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {

GenerateFlags_SUB(Op, Src2, Src1);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir);
Expand All @@ -3764,9 +3751,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down Expand Up @@ -3856,15 +3841,9 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {

StoreResult(GPRClass, Op, Src, -1);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);

TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, PtrDir);
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, Size));
}
else {
// Calculate flags early. because end of block
Expand All @@ -3877,9 +3856,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
// May or may not matter

// Read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down Expand Up @@ -3953,15 +3930,9 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {

GenerateFlags_SUB(Op, Src1, Src2);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);

TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, PtrDir);
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, Size));
}
else {
// Calculate flags early. because end of block
Expand All @@ -3970,9 +3941,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down
26 changes: 26 additions & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,11 @@ friend class FEXCore::IR::PassManager;
if (ValueOffset || MustMask)
Value = _Bfe(OpSize::i32Bit, 1, ValueOffset, Value);

// For DF, we need to transform 0/1 into 1/-1
if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) {
Value = _SubShift(OpSize::i64Bit, _Constant(1), Value, ShiftType::LSL, 1);
}

_StoreFlag(Value, BitOffset);
}
}
Expand Down Expand Up @@ -1453,11 +1458,32 @@ friend class FEXCore::IR::PassManager;
return _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
} else if (BitOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) {
return _LoadRegister(false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
} else if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) {
// Recover the sign bit, it is the logical DF value
return _Lshr(OpSize::i64Bit, _LoadDF(), _Constant(63));
} else {
return _LoadFlag(BitOffset);
}
}

// Returns (DF ? -Size : Size)
OrderedNode *LoadDir(const unsigned Size) {
  // _LoadDF yields the direction in 1/-1 form; scaling it by Size is a left
  // shift by log2(Size).
  OrderedNode *Direction = _LoadDF();
  const auto ShiftAmount = FEXCore::ilog2(Size);

  // A zero shift needs no extra IR op: the loaded value already is the
  // result.
  if (ShiftAmount == 0) {
    return Direction;
  }

  return _Lshl(IR::SizeToOpSize(CTX->GetGPRSize()), Direction, _Constant(ShiftAmount));
}

// Returns DF ? (X - Size) : (X + Size)
OrderedNode *OffsetByDir(OrderedNode *X, const unsigned Size) {
  // The loaded direction is 1/-1, so X + (direction << log2(Size)) moves X
  // forward or backward by Size in a single shifted add.
  const auto ShiftAmount = FEXCore::ilog2(Size);
  OrderedNode *Direction = _LoadDF();

  return _AddShift(OpSize::i64Bit, X, Direction, ShiftType::LSL, ShiftAmount);
}

// Set SSE comparison flags based on the result set by Arm FCMP. This converts
// NZCV from the Arm representation to an eXternal representation that's
// totally not a euphemism for x86 or anything, nuh-uh.
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ constexpr std::array<uint32_t, 17> FlagOffsets = {
FEXCore::X86State::RFLAG_SF_RAW_LOC,
FEXCore::X86State::RFLAG_TF_LOC,
FEXCore::X86State::RFLAG_IF_LOC,
FEXCore::X86State::RFLAG_DF_LOC,
FEXCore::X86State::RFLAG_DF_RAW_LOC,
FEXCore::X86State::RFLAG_OF_RAW_LOC,
FEXCore::X86State::RFLAG_IOPL_LOC,
FEXCore::X86State::RFLAG_NT_LOC,
Expand Down
7 changes: 7 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,13 @@
"DestSize": "4"
},

"GPR = LoadDF": {
"Desc": ["Loads the direction flag from the context object in -1/1",
"representation for easy consumption"
],
"DestSize": "8"
},

"GPR = LoadFlag u32:$Flag": {
"Desc": ["Loads an x86-64 flag from the context object",
"Specialized to allow flexible implementation of flag handling"
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/IR/Passes/ConstProp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1330,7 +1330,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
if (IREmit->IsValueConstant(Op->Direction, &Constant)) {
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Direction));

IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant & 1));
IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant));

Changed = true;
}
Expand All @@ -1344,7 +1344,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
if (IREmit->IsValueConstant(Op->Direction, &Constant)) {
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Direction));

IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant & 1));
IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant));

Changed = true;
}
Expand Down
Loading
Loading