Merge pull request #3536 from alyssarosenzweig/ra/rcl-rcr
OpcodeDispatcher: eliminate xblock liveness for rcl/rcr
Sonicadvance1 authored Apr 1, 2024
2 parents b1ddd8c + d3b2ddf commit cd9ffd2
Showing 6 changed files with 437 additions and 381 deletions.
104 changes: 65 additions & 39 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -2386,21 +2386,21 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {

// Calculate flags early.
CalculateDeferredFlags();

OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
const auto OpSize = OpSizeFromSrc(Op);

// Res = Src >> Shift
OrderedNode *Res = _Lshr(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
uint64_t Const;
if (IsValueConstant(WrapNode(Src), &Const)) {
Const &= Mask;
if (!Const)
return;

OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});

// Res = Src >> Shift
OrderedNode *Res = _Lshr(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

InvalidateDeferredFlags();

// Constant folded version of the above, with fused shifts.
@@ -2425,19 +2425,31 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {
}

OrderedNode *SrcMasked = _And(OpSize, Src, _Constant(Size, Mask));
CalculateFlags_ShiftVariable(SrcMasked, [this, CF, Op, Size, OpSize, SrcMasked, Dest, &Res](){
Calculate_ShiftVariable(SrcMasked, [this, Op, Size, OpSize](){
// Rematerialize loads to avoid crossblock liveness
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});

// Res = Src >> Shift
OrderedNode *Res = _Lshr(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

auto One = _Constant(Size, 1);

// Res |= (SrcMasked << (Size - Shift + 1));
// Expressed as Res | ((SrcMasked << (Size - Shift)) << 1) to get correct
// Res |= (Dest << (Size - Shift + 1));
// Expressed as Res | ((Src << (Size - Shift)) << 1) to get correct
// behaviour for Shift without clobbering NZCV. Then observe that modulo
// Size, Size - Shift = -Shift so we can use a simple Neg.
OrderedNode *NegSrc = _Neg(OpSize, SrcMasked);
//
// The masking of Lshl means we don't need to mask the source, since:
//
// -(x & Mask) & Mask = (-x) & Mask
OrderedNode *NegSrc = _Neg(OpSize, Src);
Res = _Orlshl(OpSize, Res, _Lshl(OpSize, Dest, NegSrc), 1);

// Our new CF will be bit (Shift - 1) of the source. This is hoisted up to
// avoid the need to copy the source.
auto NewCF = _Lshr(OpSize, Dest, _Sub(OpSize, SrcMasked, One));
// avoid the need to copy the source. Again, the Lshr absorbs the masking.
auto NewCF = _Lshr(OpSize, Dest, _Sub(OpSize, Src, One));
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(NewCF, 0, true);

// Since shift != 0 we can inject the CF
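
For readers following the algebra in the comments above, here is a standalone model of the 64-bit rcr construction. It is only a sketch under stated assumptions (plain C++ shifts standing in for the IR ops, with the same shift-count masking the comments rely on), not FEX code:

#include <cassert>
#include <cstdint>

// Sketch of the construction described above, for the 64-bit case:
//   Res   = (Dest >> Shift) | ((Dest << (Size - Shift)) << 1) | (CF << (Size - Shift))
//   NewCF = bit (Shift - 1) of Dest
static uint64_t rcr64(uint64_t Dest, unsigned Shift, bool CF, bool &NewCF) {
  const unsigned Size = 64;
  Shift &= Size - 1;            // x86 masks the rotate count for 64-bit operands
  assert(Shift != 0);           // the Shift == 0 case is branched over entirely

  uint64_t Res = Dest >> Shift;
  Res |= (Dest << (Size - Shift)) << 1;    // wrap-around bits; no shift ever reaches 64
  Res |= uint64_t(CF) << (Size - Shift);   // inject the old carry
  NewCF = (Dest >> (Shift - 1)) & 1;       // new carry is the last bit rotated out
  return Res;
}

int main() {
  bool NewCF = false;
  assert(rcr64(0x8000000000000000ull, 4, true, NewCF) == 0x1800000000000000ull && !NewCF);

  // The identity that lets the new code feed the unmasked Src into Neg:
  // since the shifter only consumes the low six bits, -(x & 63) and -x agree.
  for (uint64_t x = 0; x < 256; x++)
    assert(((-(x & 63)) & 63) == ((-x) & 63));
  return 0;
}
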
@@ -2453,10 +2465,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) {
}

void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {
// Calculate flags early. Need to get flags outside of
// CalculateFlags_ShiftVariable because it will invalidate.
CalculateDeferredFlags();
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

const auto Size = GetSrcBitSize(Op);

@@ -2466,7 +2475,12 @@ void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {

// CF only changes if we actually shifted. OF undefined if we didn't shift.
// The result is unchanged if we didn't shift. So branch over the whole thing.
CalculateFlags_ShiftVariable(Src, [this, CF, Op, Size, Src](){
Calculate_ShiftVariable(Src, [this, Op, Size](){
// Rematerialized to avoid crossblock liveness
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags);
OrderedNode *Tmp{};

@@ -2528,26 +2542,28 @@ void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {

// Entire bitfield has been set up
// Just extract the 8 or 16 bits we need
OrderedNode *Res = _Lshr(OpSize::i64Bit, Tmp, Src);
OrderedNode *Res = _Lshr(OpSize::i32Bit, Tmp, Src);

StoreResult(GPRClass, Op, Res, -1);

uint64_t SrcConst;
bool IsSrcConst = IsValueConstant(WrapNode(Src), &SrcConst);
SrcConst &= 0x1f;

// Our new CF will be bit (Shift - 1) of the source
// Our new CF will be bit (Shift - 1) of the source. 32-bit Lshr masks the
// same as x86, but if we constant fold we must mask ourselves.
if (IsSrcConst) {
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(Tmp, SrcConst - 1, true);
} else {
auto One = _Constant(Size, 1);
auto NewCF = _Lshr(OpSize::i64Bit, Tmp, _Sub(OpSize::i32Bit, Src, One));
auto NewCF = _Lshr(OpSize::i32Bit, Tmp, _Sub(OpSize::i32Bit, Src, One));
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(NewCF, 0, true);
}

// OF is the top two MSBs XOR'd together
// Only valid when Shift == 1; it is undefined otherwise
if (!IsSrcConst || SrcConst == 1) {
auto NewOF = _XorShift(IR::SizeToOpSize(std::max<uint8_t>(4u, GetOpSize(Res))), Res, Res, ShiftType::LSR, 1);
auto NewOF = _XorShift(OpSize::i32Bit, Res, Res, ShiftType::LSR, 1);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(NewOF, Size - 2, true);
}
});
@@ -2590,19 +2606,19 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
CalculateDeferredFlags();

OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
const auto OpSize = OpSizeFromSrc(Op);

// Res = Src << Shift
OrderedNode *Res = _Lshl(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

uint64_t Const;
if (IsValueConstant(WrapNode(Src), &Const)) {
Const &= Mask;
if (!Const)
return;

// Res = Src << Shift
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Res = _Lshl(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

InvalidateDeferredFlags();

// Res |= (Src << (Size - Shift + 1));
@@ -2627,20 +2643,28 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
}

OrderedNode *SrcMasked = _And(OpSize, Src, _Constant(Size, Mask));
CalculateFlags_ShiftVariable(SrcMasked, [this, CF, Op, Size, OpSize, SrcMasked, Dest, &Res](){
// Res |= (SrcMasked >> (Size - Shift + 1)), expressed as
// Res | ((SrcMasked >> (-Shift)) >> 1), since Size - Shift = -Shift mod
// Size.
auto NegSrc = _Neg(OpSize, SrcMasked);
Calculate_ShiftVariable(SrcMasked, [this, Op, Size, OpSize](){
// Rematerialized to avoid crossblock liveness
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

// Res = Src << Shift
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Res = _Lshl(OpSize, Dest, Src);
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

// Res |= (Dest >> (Size - Shift + 1)), expressed as
// Res | ((Dest >> (-Shift)) >> 1), since Size - Shift = -Shift mod
// Size. The shift absorbs the masking.
auto NegSrc = _Neg(OpSize, Src);
Res = _Orlshr(OpSize, Res, _Lshr(OpSize, Dest, NegSrc), 1);

// Our new CF will be bit (Shift - 1) of the source
auto NewCF = _Lshr(OpSize, Dest, NegSrc);
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(NewCF, 0, true);

// Since Shift != 0 we can inject the CF
OrderedNode *CFShl = _Sub(OpSize, SrcMasked, _Constant(Size, 1));
auto TmpCF = _Lshl(OpSize::i64Bit, CF, CFShl);
// Since Shift != 0 we can inject the CF. Shift absorbs the masking.
OrderedNode *CFShl = _Sub(OpSize, Src, _Constant(Size, 1));
auto TmpCF = _Lshl(OpSize, CF, CFShl);
Res = _Or(OpSize, Res, TmpCF);

// OF is the top two MSBs XOR'd together
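
The rcl direction can be modelled the same way. A minimal sketch, again assuming plain C++ shifts in place of the IR ops (not FEX code), showing how the single Dest >> (-Shift) value supplies both the wrapped-in bits and, in its low bit, the new carry:

#include <cassert>
#include <cstdint>

static uint64_t rcl64(uint64_t Dest, unsigned Shift, bool CF, bool &NewCF) {
  const unsigned Size = 64;
  Shift &= Size - 1;                        // x86 masks the rotate count
  assert(Shift != 0);                       // the zero case is branched over

  uint64_t Res = Dest << Shift;
  uint64_t Wrap = Dest >> (Size - Shift);   // stands in for _Lshr(OpSize, Dest, NegSrc)
  Res |= Wrap >> 1;                         // Res |= Dest >> (Size - Shift + 1)
  NewCF = Wrap & 1;                         // bit (Size - Shift) of the source
  Res |= uint64_t(CF) << (Shift - 1);       // inject the old carry
  return Res;
}

int main() {
  bool NewCF = true;
  // rcl of 1 by 1 with the carry set: result 3, new carry clear.
  assert(rcl64(1, 1, true, NewCF) == 3 && !NewCF);
  return 0;
}
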
@@ -2656,10 +2680,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) {
}

void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {
// Calculate flags early. Get CF outside the CalculateFlags_ShiftVariable
// since that invalidates flags.
CalculateDeferredFlags();
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

const auto Size = GetSrcBitSize(Op);

@@ -2669,9 +2690,14 @@ void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {

// CF only changes if we actually shifted. OF undefined if we didn't shift.
// The result is unchanged if we didn't shift. So branch over the whole thing.
CalculateFlags_ShiftVariable(Src, [this, CF, Op, Size, Src](){
Calculate_ShiftVariable(Src, [this, Op, Size](){
// Rematerialized to avoid crossblock liveness
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
Src = AndConst(OpSize::i32Bit, Src, 0x1F);
OrderedNode *Dest = LoadSource(GPRClass, Op, Op->Dest, Op->Flags);

auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

OrderedNode *Tmp = _Constant(64, 0);

for (size_t i = 0; i < (32 + Size + 1); i += (Size + 1)) {
@@ -2690,7 +2716,7 @@ void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) {
// Shift 1 more bit than expected to get our result
// Shifting to the right will now behave like a rotate to the left
// Which we emulate with a _Ror
OrderedNode *Res = _Ror(OpSize::i64Bit, Tmp, _Sub(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, _Constant(Size, 64), Src));
OrderedNode *Res = _Ror(OpSize::i64Bit, Tmp, _Neg(OpSize::i32Bit, Src));

StoreResult(GPRClass, Op, Res, -1);

16 changes: 10 additions & 6 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -1724,15 +1724,12 @@ friend class FEXCore::IR::PassManager;
}

template <typename F>
void CalculateFlags_ShiftVariable(OrderedNode *Shift, F&& CalculateFlags) {
// We are the ones calculating the deferred flags. Don't recurse!
InvalidateDeferredFlags();

void Calculate_ShiftVariable(OrderedNode *Shift, F&& Calculate) {
// RCR can call this with constants, so handle that without branching.
uint64_t Const;
if (IsValueConstant(WrapNode(Shift), &Const)) {
if (Const)
CalculateFlags();
Calculate();

return;
}
@@ -1749,7 +1746,7 @@ friend class FEXCore::IR::PassManager;
SetCurrentCodeBlock(SetBlock);
StartNewBlock();
{
CalculateFlags();
Calculate();
Jump(EndBlock);
}

@@ -1758,6 +1755,13 @@ friend class FEXCore::IR::PassManager;
PossiblySetNZCVBits |= OldSetNZCVBits;
}

template <typename F>
void CalculateFlags_ShiftVariable(OrderedNode *Shift, F&& CalculateFlags) {
// We are the ones calculating the deferred flags. Don't recurse!
InvalidateDeferredFlags();
Calculate_ShiftVariable(Shift, CalculateFlags);
}

/**
* @name These functions are used by the deferred flag handling while it is calculating and storing flags in to RFLAGs.
* @{ */
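
The header change above splits the old helper in two: Calculate_ShiftVariable emits only the branch-over-the-body control flow, while CalculateFlags_ShiftVariable keeps the InvalidateDeferredFlags call for existing callers, so the rotate handlers can invalidate flags themselves and call the plain version. A rough host-side analogue of the control flow it emits (an illustration under that reading, not FEX code):

#include <cstdint>
#include <functional>
#include <optional>

// A count known at compile time is handled with no emitted branch (run the
// body inline, or emit nothing at all), while an unknown count guards the body
// with a single "count != 0" test -- the CondJump/SetBlock/Jump structure in
// the real helper.
static void ShiftVariable(std::optional<uint64_t> constantCount,
                          uint64_t runtimeCount,
                          const std::function<void()> &body) {
  if (constantCount) {
    if (*constantCount != 0)   // IsValueConstant path: no branch at all
      body();
    return;
  }
  if (runtimeCount != 0)       // variable path: branch over the whole body
    body();
}

int main() {
  int ran = 0;
  ShiftVariable(std::nullopt, 3, [&] { ran++; });   // guarded at runtime, taken
  ShiftVariable(uint64_t{0}, 0, [&] { ran++; });    // constant zero: skipped entirely
  return ran == 1 ? 0 : 1;
}
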
11 changes: 11 additions & 0 deletions FEXCore/Source/Interface/IR/Passes/ConstProp.cpp
@@ -864,6 +864,17 @@ bool ConstProp::ConstantPropagation(IREmitter *IREmit, const IRListView& Current
}
break;
}
case OP_NEG: {
auto Op = IROp->CW<IR::IROp_Neg>();
uint64_t Constant{};

if (IREmit->IsValueConstant(Op->Header.Args[0], &Constant)) {
uint64_t NewConstant = -Constant;
IREmit->ReplaceWithConstant(CodeNode, NewConstant);
Changed = true;
}
break;
}
case OP_LSHL: {
auto Op = IROp->CW<IR::IROp_Lshl>();
uint64_t Constant1{};
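
The new OP_NEG case folds a negation whose operand is already constant. The fold is plain unsigned negation, which wraps modulo 2^64, i.e. two's complement. A minimal check of that arithmetic (and, as an assumption about intent, this is what lets a constant rotate count survive the _Neg feeding _Ror in the smaller-width paths above and still fold down to fixed shifts):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Constant = 5;
  uint64_t NewConstant = -Constant;           // unsigned wrap: 2^64 - 5
  assert(NewConstant == 0xFFFFFFFFFFFFFFFBull);
  assert((NewConstant & 63) == 59);           // (-5) mod 64 == 64 - 5, as the rotates expect
  return 0;
}
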
27 changes: 13 additions & 14 deletions unittests/InstructionCountCI/FlagM/FlagOpts.json
@@ -197,25 +197,24 @@
]
},
"Variable rotate-through-carry dead": {
"ExpectedInstructionCount": 16,
"ExpectedInstructionCount": 15,
"x86Insts": [
"rcr rax, cl",
"test rax, rdx"
],
"ExpectedArm64ASM": [
"mov x20, x4",
"lsr x21, x20, x5",
"cset w22, hs",
"and x23, x5, #0x3f",
"cbz x23, #+0x2c",
"neg x24, x23",
"lsl x25, x20, x24",
"orr x21, x21, x25, lsl #1",
"sub x23, x23, #0x1 (1)",
"lsr x20, x20, x23",
"rmif x20, #63, #nzCv",
"lsl x20, x22, x24",
"orr x4, x21, x20",
"and x20, x5, #0x3f",
"cbz x20, #+0x34",
"lsr x20, x4, x5",
"cset w21, hs",
"neg x22, x5",
"lsl x23, x4, x22",
"orr x20, x20, x23, lsl #1",
"sub x23, x5, #0x1 (1)",
"lsr x23, x4, x23",
"rmif x23, #63, #nzCv",
"lsl x21, x21, x22",
"orr x4, x20, x21",
"eor x20, x4, x4, lsr #1",
"rmif x20, #62, #nzcV",
"ands x26, x4, x6"
(2 more changed files not shown)
