Skip to content

Commit

Permalink
Merge pull request #3542 from alyssarosenzweig/ra/rep
Browse files Browse the repository at this point in the history
Eliminate xblock liveness with rep cmp/lod/scas
  • Loading branch information
Sonicadvance1 authored Apr 2, 2024
2 parents 29c6281 + ad0dd34 commit e8abc88
Show file tree
Hide file tree
Showing 8 changed files with 1,012 additions and 502 deletions.
18 changes: 18 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,24 @@ DEF_OP(CondAddNZCV) {
}
}

// Lowers the CondSubNZCV IR op to an Arm64 conditional compare (`ccmp`).
// Src1 is compared against Src2 under Op->Cond, and Op->FalseNZCV is forwarded
// as the instruction's flags operand (presumably the NZCV value installed when
// the condition does not hold -- confirm against the emitter).
DEF_OP(CondSubNZCV) {
  const auto OpSize = IROp->Size;
  auto Op = IROp->C<IR::IROp_CondSubNZCV>();

  // Only 32-bit and 64-bit operations are accepted here.
  LOGMAN_THROW_AA_FMT(OpSize == IR::i32Bit || OpSize == IR::i64Bit, "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = OpSize == IR::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  const auto FalseFlags = static_cast<ARMEmitter::StatusFlags>(Op->FalseNZCV);
  const auto Lhs = GetZeroableReg(Op->Src1);
  const auto SelectCC = MapSelectCC(Op->Cond);

  // Use the immediate form when Src2 is an inline constant; otherwise compare
  // against the register holding Src2.
  uint64_t Imm = 0;
  if (!IsInlineConstant(Op->Src2, &Imm)) {
    ccmp(EmitSize, Lhs, GetReg(Op->Src2.ID()), FalseFlags, SelectCC);
  } else {
    ccmp(EmitSize, Lhs, Imm, FalseFlags, SelectCC);
  }
}

DEF_OP(Neg) {
auto Op = IROp->C<IR::IROp_Neg>();
const uint8_t OpSize = IROp->Size;
Expand Down
299 changes: 156 additions & 143 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3548,75 +3548,90 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {

bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto PtrDir = LoadDir(Size);
// If rcx = 0, skip the whole loop.
OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
auto OuterJump = CondJump(Counter, {COND_EQ});

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(JumpStart, LoopStart);
SetCurrentCodeBlock(LoopStart);
auto BeforeLoop = CreateNewCodeBlockAfter(GetCurrentBlock());
SetFalseJumpTarget(OuterJump, BeforeLoop);
SetCurrentCodeBlock(BeforeLoop);
StartNewBlock();

OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
ForeachDirection([this, Op, Size, REPE](int PtrDir) {
IRPair<IROp_CondJump> InnerJump;
auto JumpIntoLoop = Jump();

// Can we end the block?
auto CondJump_ = CondJump(Counter, {COND_EQ});
IRPair<IROp_CondJump> InternalCondJump;

auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
SetFalseJumpTarget(CondJump_, LoopTail);
SetCurrentCodeBlock(LoopTail);
StartNewBlock();
// Setup for the loop
auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock());
SetCurrentCodeBlock(LoopHeader);
StartNewBlock();
SetJumpTarget(JumpIntoLoop, LoopHeader);

// Working loop
{
OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI);
OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI);
// Working loop
{
OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI);
OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI);

// Only ES prefix
Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);
// Default DS prefix
Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX);
// Only ES prefix
Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);
// Default DS prefix
Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX);

auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size);
auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size);
auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size);
auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size);

GenerateFlags_SUB(Op, Src2, Src1);
// We'll calculate PF/AF after the loop, so use them as temporaries here.
_StoreRegister(Src1, false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
_StoreRegister(Src2, false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());

// Calculate flags early.
CalculateDeferredFlags();
OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);

OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);
// Decrement counter
TailCounter = _SubWithFlags(OpSize::i64Bit, TailCounter, _Constant(1));

// Decrement counter
TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1));
// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);

// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size));
StoreGPRRegister(X86State::REG_RDI, Dest_RDI);

// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir);
StoreGPRRegister(X86State::REG_RDI, Dest_RDI);
// Offset second pointer
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size));
StoreGPRRegister(X86State::REG_RSI, Dest_RSI);

// Offset second pointer
Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, PtrDir);
StoreGPRRegister(X86State::REG_RSI, Dest_RSI);
// If TailCounter != 0, compare sources.
// If TailCounter == 0, set ZF iff that would break.
_CondSubNZCV(OpSize::i64Bit, Src2, Src1, {COND_NEQ}, REPE ? 0 : (1 << 2) /* Z */);
CachedNZCV = nullptr;
NZCVDirty = false;
InnerJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ});

CalculateDeferredFlags();
InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ});
// Jump back to the start if we have more work to do
SetTrueJumpTarget(InnerJump, LoopHeader);
}

// Jump back to the start if we have more work to do
SetTrueJumpTarget(InternalCondJump, LoopStart);
}
// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock());
SetFalseJumpTarget(InnerJump, LoopEnd);
SetCurrentCodeBlock(LoopEnd);
StartNewBlock();
});

// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
SetTrueJumpTarget(CondJump_, LoopEnd);

SetFalseJumpTarget(InternalCondJump, LoopEnd);
{
// Grab the sources from the last iteration so we can set flags.
auto Src1 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
auto Src2 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
GenerateFlags_SUB(Op, Src2, Src1);
CalculateDeferredFlags();
}
auto Jump_ = Jump();

SetCurrentCodeBlock(LoopEnd);
auto Exit = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(Jump_, Exit);
SetTrueJumpTarget(OuterJump, Exit);
SetCurrentCodeBlock(Exit);
StartNewBlock();
}
}
Expand Down Expand Up @@ -3647,65 +3662,64 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
// Calculate flags early. because end of block
CalculateDeferredFlags();

// XXX: Theoretically LODS could be optimized to
// RSI += {-}(RCX * Size)
// RAX = [RSI - Size]
// But this might violate the case of an application scanning pages for read permission and catching the fault
// May or may not matter

// Read DF once
auto PtrDir = LoadDir(Size);
ForeachDirection([this, Op, Size](int PtrDir) {
// XXX: Theoretically LODS could be optimized to
// RSI += {-}(RCX * Size)
// RAX = [RSI - Size]
// But this might violate the case of an application scanning pages for read permission and catching the fault
// May or may not matter

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(JumpStart, LoopStart);
SetCurrentCodeBlock(LoopStart);
StartNewBlock();
auto JumpStart = Jump();
// Make sure to start a new block after ending this one
auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(JumpStart, LoopStart);
SetCurrentCodeBlock(LoopStart);
StartNewBlock();

OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);

// Can we end the block?
// Can we end the block?

// We leave if RCX = 0
auto CondJump_ = CondJump(Counter, {COND_EQ});
// We leave if RCX = 0
auto CondJump_ = CondJump(Counter, {COND_EQ});

auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
SetFalseJumpTarget(CondJump_, LoopTail);
SetCurrentCodeBlock(LoopTail);
StartNewBlock();
auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
SetFalseJumpTarget(CondJump_, LoopTail);
SetCurrentCodeBlock(LoopTail);
StartNewBlock();

// Working loop
{
OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI);
// Working loop
{
OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI);

Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX);
Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX);

auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size);
auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size);

StoreResult(GPRClass, Op, Src, -1);
StoreResult(GPRClass, Op, Src, -1);

OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);
OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);

// Decrement counter
TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1));
// Decrement counter
TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1));

// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);

// Offset the pointer
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, PtrDir);
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);
// Offset the pointer
TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size));
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);

// Jump back to the start, we have more work to do
Jump(LoopStart);
}
// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
SetTrueJumpTarget(CondJump_, LoopEnd);
SetCurrentCodeBlock(LoopEnd);
StartNewBlock();
// Jump back to the start, we have more work to do
Jump(LoopStart);
}
// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
SetTrueJumpTarget(CondJump_, LoopEnd);
SetCurrentCodeBlock(LoopEnd);
StartNewBlock();
});
}
}

Expand Down Expand Up @@ -3736,71 +3750,70 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
// Calculate flags early. because end of block
CalculateDeferredFlags();

bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto PtrDir = LoadDir(Size);
ForeachDirection([this, Op, Size](int Dir){
bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(JumpStart, LoopStart);
SetCurrentCodeBlock(LoopStart);
StartNewBlock();
auto JumpStart = Jump();
// Make sure to start a new block after ending this one
auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
SetJumpTarget(JumpStart, LoopStart);
SetCurrentCodeBlock(LoopStart);
StartNewBlock();

OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);

// Can we end the block?
// We leave if RCX = 0
auto CondJump_ = CondJump(Counter, {COND_EQ});
IRPair<IROp_CondJump> InternalCondJump;
// Can we end the block?
// We leave if RCX = 0
auto CondJump_ = CondJump(Counter, {COND_EQ});
IRPair<IROp_CondJump> InternalCondJump;

auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
SetFalseJumpTarget(CondJump_, LoopTail);
SetCurrentCodeBlock(LoopTail);
StartNewBlock();
auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
SetFalseJumpTarget(CondJump_, LoopTail);
SetCurrentCodeBlock(LoopTail);
StartNewBlock();

// Working loop
{
OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI);
// Working loop
{
OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI);

Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);
Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size);
auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size);

GenerateFlags_SUB(Op, Src1, Src2);
GenerateFlags_SUB(Op, Src1, Src2);

// Calculate flags early.
CalculateDeferredFlags();
// Calculate flags early.
CalculateDeferredFlags();

OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);
OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX);
OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);

// Decrement counter
TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1));
// Decrement counter
TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1));

// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);
// Store the counter since we don't have phis
StoreGPRRegister(X86State::REG_RCX, TailCounter);

// Offset the pointer
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, PtrDir);
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);
// Offset the pointer
TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size));
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);

CalculateDeferredFlags();
InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ});
CalculateDeferredFlags();
InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ});

// Jump back to the start if we have more work to do
SetTrueJumpTarget(InternalCondJump, LoopStart);
}
// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
SetTrueJumpTarget(CondJump_, LoopEnd);
// Jump back to the start if we have more work to do
SetTrueJumpTarget(InternalCondJump, LoopStart);
}
// Make sure to start a new block after ending this one
auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
SetTrueJumpTarget(CondJump_, LoopEnd);

SetFalseJumpTarget(InternalCondJump, LoopEnd);
SetFalseJumpTarget(InternalCondJump, LoopEnd);

SetCurrentCodeBlock(LoopEnd);
StartNewBlock();
SetCurrentCodeBlock(LoopEnd);
StartNewBlock();
});
}
}

Expand Down
Loading

0 comments on commit e8abc88

Please sign in to comment.