Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Add 3-opt implementation for improving upon RPO-based layout #103450

Merged
merged 48 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
5813e1a
Implement k-opt for non-EH methods
amanasifkhalid Jun 13, 2024
3f4e749
Enable for methods with EH
amanasifkhalid Jun 13, 2024
da291af
Merge branch 'main' into k-opt-layout
amanasifkhalid Jun 13, 2024
699714b
Add comments
amanasifkhalid Jun 13, 2024
0849f76
Style
amanasifkhalid Jun 13, 2024
e223927
Merge branch 'main' into k-opt-layout
amanasifkhalid Jun 13, 2024
044b332
Only one iteration for now; try to reduce TP cost
amanasifkhalid Jun 13, 2024
f0e7f6b
Remove initial layout cost calculation
amanasifkhalid Jun 13, 2024
41efb9b
Conditionalize EH checks
amanasifkhalid Jun 13, 2024
5b7a85e
Merge from main
amanasifkhalid Sep 18, 2024
94c2272
Add priority queue impl
amanasifkhalid Sep 18, 2024
0081d9b
wip
amanasifkhalid Sep 20, 2024
4700e65
Fix lambda capture
amanasifkhalid Sep 20, 2024
cabacf9
Merge branch 'main' into k-opt-layout
amanasifkhalid Sep 23, 2024
ebb7e6a
Consider forward conditional jumps
amanasifkhalid Sep 24, 2024
4eb4471
Remove debug print
amanasifkhalid Sep 24, 2024
0175b18
Consider backward jumps; find more initial candidates
amanasifkhalid Sep 25, 2024
bbc28df
Revert irrelevant changes
amanasifkhalid Sep 25, 2024
2e507be
Missed a few
amanasifkhalid Sep 25, 2024
9ed6452
Add JitDump check
amanasifkhalid Sep 25, 2024
40fc6bc
Add more candidate edges when reordering
amanasifkhalid Sep 25, 2024
0b8e830
Merge branch 'main' into k-opt-layout
amanasifkhalid Sep 25, 2024
a3b7392
Don't add duplicate edges to cutPoints
amanasifkhalid Sep 26, 2024
d468a8a
Consider each candidate edge at most once
amanasifkhalid Sep 27, 2024
9cc1860
Merge from main
amanasifkhalid Oct 17, 2024
3ee5c42
Don't factor cold branches into cost calculation
amanasifkhalid Oct 17, 2024
5c134b5
Merge from main
amanasifkhalid Oct 21, 2024
d6fea98
Remove used candidates set
amanasifkhalid Oct 21, 2024
872b431
Revert "Remove used candidates set"
amanasifkhalid Oct 21, 2024
fadaeba
Tweak cold block finding
amanasifkhalid Oct 21, 2024
e17b169
Refactor into ThreeOptLayout class
amanasifkhalid Oct 25, 2024
eea003f
Reorder EH regions
amanasifkhalid Oct 28, 2024
96ef576
Fix completely cold try regions (clean replay)
amanasifkhalid Oct 28, 2024
8d95f49
Small cleanup
amanasifkhalid Oct 29, 2024
5388e9e
Add currEHRegion member
amanasifkhalid Oct 29, 2024
5e90c89
Only move blocks within same region after 3-opt
amanasifkhalid Oct 29, 2024
1cecc72
Merge from main
amanasifkhalid Oct 29, 2024
b7a0a03
Cleanup
amanasifkhalid Oct 29, 2024
6ad7541
Comments
amanasifkhalid Oct 29, 2024
a26be54
EdgeCmp tie-breaker
amanasifkhalid Oct 30, 2024
a1b8661
Comment feedback; replace 'usedCandidates' set with FlowEdge flag
amanasifkhalid Oct 30, 2024
ebbb973
Reframe cost model as maximal score problem
amanasifkhalid Oct 31, 2024
7d9734f
Small refactor
amanasifkhalid Oct 31, 2024
a2b3e16
Simplify adding new candidate edges
amanasifkhalid Oct 31, 2024
88d5389
Guess number of hot blocks when allocating arrays
amanasifkhalid Nov 4, 2024
4ab9f17
Skip reordering if too few blocks
amanasifkhalid Nov 4, 2024
ed39074
Add another check for too few blocks
amanasifkhalid Nov 4, 2024
f2eb773
JITDUMP msg
amanasifkhalid Nov 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/coreclr/jit/arraystack.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
template <class T>
class ArrayStack
{
static const int builtinSize = 8;

public:
static constexpr int builtinSize = 8;

explicit ArrayStack(CompAllocator alloc, int initialCapacity = builtinSize)
: m_alloc(alloc)
{
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6147,6 +6147,9 @@ class Compiler
void fgDoReversePostOrderLayout();
void fgMoveColdBlocks();

template <bool hasEH>
void fgSearchImprovedLayout();

template <bool hasEH>
void fgMoveBackwardJumpsToSuccessors();

Expand Down
283 changes: 265 additions & 18 deletions src/coreclr/jit/fgopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3421,6 +3421,15 @@ bool Compiler::fgReorderBlocks(bool useProfile)
fgDoReversePostOrderLayout();
fgMoveColdBlocks();

if (compHndBBtabCount != 0)
{
fgSearchImprovedLayout<true>();
}
else
{
fgSearchImprovedLayout<false>();
}

// Renumber blocks to facilitate LSRA's order of block visitation
// TODO: Consider removing this, and using traversal order in lSRA
//
Expand Down Expand Up @@ -4649,6 +4658,20 @@ void Compiler::fgMoveBackwardJumpsToSuccessors()
}
}

struct CallFinallyPair
{
    BasicBlock* callFinally;
    BasicBlock* callFinallyRet;

    // Explicit constructor so pairs can be constructed in place via ArrayStack::Emplace
    //
    CallFinallyPair(BasicBlock* callFinallyBlock, BasicBlock* retBlock)
        : callFinally(callFinallyBlock)
        , callFinallyRet(retBlock)
    {
    }
};

//-----------------------------------------------------------------------------
// fgDoReversePostOrderLayout: Reorder blocks using a greedy RPO traversal.
//
Expand Down Expand Up @@ -4677,8 +4700,12 @@ void Compiler::fgDoReversePostOrderLayout()
{
BasicBlock* const block = dfsTree->GetPostOrder(i);
BasicBlock* const blockToMove = dfsTree->GetPostOrder(i - 1);
fgUnlinkBlock(blockToMove);
fgInsertBBafter(block, blockToMove);

if (!block->NextIs(blockToMove))
{
fgUnlinkBlock(blockToMove);
fgInsertBBafter(block, blockToMove);
}
}

// The RPO established a good base layout, but in some cases, it might produce a subpar layout for loops.
Expand Down Expand Up @@ -4709,20 +4736,6 @@ void Compiler::fgDoReversePostOrderLayout()

// The RPO will break up call-finally pairs, so save them before re-ordering
//
struct CallFinallyPair
{
BasicBlock* callFinally;
BasicBlock* callFinallyRet;

// Constructor provided so we can call ArrayStack::Emplace
//
CallFinallyPair(BasicBlock* first, BasicBlock* second)
: callFinally(first)
, callFinallyRet(second)
{
}
};

ArrayStack<CallFinallyPair> callFinallyPairs(getAllocator());

for (EHblkDsc* const HBtab : EHClauses(this))
Expand Down Expand Up @@ -4761,12 +4774,16 @@ void Compiler::fgDoReversePostOrderLayout()
continue;
}

fgUnlinkBlock(blockToMove);
fgInsertBBafter(block, blockToMove);
if (!block->NextIs(blockToMove))
{
fgUnlinkBlock(blockToMove);
fgInsertBBafter(block, blockToMove);
}
}
}

// Fix up call-finally pairs
// (We assume the RPO will mess these up, so don't bother checking if the blocks are still adjacent)
//
for (int i = 0; i < callFinallyPairs.Height(); i++)
{
Expand Down Expand Up @@ -5106,6 +5123,236 @@ void Compiler::fgMoveColdBlocks()
ehUpdateTryLasts<decltype(getTryLast), decltype(setTryLast)>(getTryLast, setTryLast);
}

//-----------------------------------------------------------------------------
// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method:
// - Identify a subset of "interesting" (not cold, has branches, etc.) blocks to move
// - Partition this set into three segments: S1 - S2 - S3
// - Evaluate cost of swapped layout: S1 - S3 - S2
// - If the cost improves, keep this layout
// - Repeat for a certain number of iterations, or until no improvements are made
//
// Template parameters:
// hasEH - If true, method has EH regions, so check that we don't try to move blocks in different regions
//
template <bool hasEH>
void Compiler::fgSearchImprovedLayout()
{
#ifdef DEBUG
if (verbose)
{
printf("*************** In fgSearchImprovedLayout()\n");

printf("\nInitial BasicBlocks");
fgDispBasicBlocks(verboseTrees);
printf("\n");
}
#endif // DEBUG

BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this));
BasicBlock* startBlock = nullptr;

// Find the first block that doesn't fall into its hottest successor.
// This will be our first "interesting" block.
//
for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction()))
{
// Ignore try/handler blocks
if (hasEH && (block->hasTryIndex() || block->hasHndIndex()))
{
continue;
}

BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum);
FlowEdge* hottestSuccEdge = nullptr;

for (FlowEdge* const succEdge : block->SuccEdges(this))
{
BasicBlock* const succ = succEdge->getDestinationBlock();

// Ignore try/handler successors
//
if (hasEH && (succ->hasTryIndex() || succ->hasHndIndex()))
{
continue;
}

const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum);

if (isForwardJump &&
((hottestSuccEdge == nullptr) || (succEdge->getLikelihood() > hottestSuccEdge->getLikelihood())))
{
hottestSuccEdge = succEdge;
}
}

if ((hottestSuccEdge != nullptr) && !block->NextIs(hottestSuccEdge->getDestinationBlock()))
{
// We found the first "interesting" block that doesn't fall into its hottest successor
//
startBlock = block;
break;
}
}

if (startBlock == nullptr)
{
JITDUMP("\nSkipping reordering");
return;
}

// blockVector will contain the set of interesting blocks to move.
// tempBlockVector will assist with moving segments of interesting blocks.
//
BasicBlock** blockVector = new BasicBlock*[fgBBNumMax];
BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax];
unsigned blockCount = 0;
ArrayStack<CallFinallyPair> callFinallyPairs(getAllocator(), hasEH ? ArrayStack<CallFinallyPair>::builtinSize : 0);

for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction()))
{
// Don't consider blocks in EH regions
//
if (block->hasTryIndex() || block->hasHndIndex())
{
continue;
}

// We've reached the cold section of the main method body;
// nothing is interesting at this point
//
if (block->isRunRarely())
{
break;
}

blockVector[blockCount] = block;
tempBlockVector[blockCount++] = block;

if (hasEH && block->isBBCallFinallyPair())
{
callFinallyPairs.Emplace(block, block->Next());
}
}

if (blockCount < 3)
{
JITDUMP("\nNot enough interesting blocks; skipping reordering");
return;
}

JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum);

auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t {
assert(block != nullptr);

if ((block->NumSucc() == 0) || (next == nullptr))
{
return 0.0;
}

const weight_t cost = block->bbWeight;

for (FlowEdge* const edge : block->SuccEdges())
{
if (edge->getDestinationBlock() == next)
{
return cost - edge->getLikelyWeight();
}
}

return cost;
};

// finalBlock is the first block after the set of interesting blocks.
// We will need to keep track of it to compute the cost of creating/breaking fallthrough into it.
// finalBlock can be null.
//
BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next();
bool improvedLayout = true;
constexpr unsigned maxIter = 5; // TODO: Reconsider?

for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++)
{
JITDUMP("\n\n--Iteration %d--", (numIter + 1));
improvedLayout = false;
BasicBlock* const exitBlock = blockVector[blockCount - 1];

for (unsigned i = 1; i < (blockCount - 1); i++)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the root of the TP cost is here -- we want to avoid having to search for possible cut points.

One approach is to just pick randomly, but I think we can do better for now. Roughly speaking in the pass above we should find all blocks that are either not just before their optimal successor and/or not just after their optimal successor.

We can rank these by the difference in the current vs optimal score. Then greedily pick the worst, that gives the first cut point. For the second cut point you can pick the best pred for the first cut point's current next block, or the best succ for the current pred of the first cut point's ideal successor. That is, if we have

S ~~~~ 1|2 ~~~ 3|4 ~~~ 5|6 ~~~ E

1's ideal succ is 4

reordering is

S ~~~~ 1|4 ~~~ 5|2 ~~~ 3|6 ~~~ E

So we either try and find a 5 which is the ideal pred of 2, or a 6 which is the ideal succ of 3.

Failing that we might pick some other block that is not currently followed by its ideal succ.

So one idea is to keep 3 values for each block: its min score, current score, and best score (lower is better). Order the blocks by current-min. Pick of the best as the first split, and then see if any of the next few provide a good second split.

Likely though this ends up needing a priority queue or similar as once we accept an arrangement we need to update some of the costings...

{
BasicBlock* const blockI = blockVector[i];
BasicBlock* const blockIPrev = blockVector[i - 1];

for (unsigned j = i + 1; j < blockCount; j++)
{
// Evaluate the current partition at (i,j)
// S1: 0 ~ i-1
// S2: i ~ j-1
// S3: j ~ exitBlock

BasicBlock* const blockJ = blockVector[j];
BasicBlock* const blockJPrev = blockVector[j - 1];

const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) +
evaluateCost(exitBlock, finalBlock);
const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) +
evaluateCost(blockJPrev, finalBlock);

if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001))
{
JITDUMP("\nFound better layout by partitioning at i=%d, j=%d", i, j);
JITDUMP("\nOld score: %f, New score: %f", oldScore, newScore);
const unsigned part1Size = i;
const unsigned part2Size = j - i;
const unsigned part3Size = blockCount - j;

memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size);
memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size,
sizeof(BasicBlock*) * part3Size);
memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size,
sizeof(BasicBlock*) * part2Size);

std::swap(blockVector, tempBlockVector);
improvedLayout = true;
break;
}
}

if (improvedLayout)
{
break;
}
}
}

// Rearrange blocks
//
for (unsigned i = 1; i < blockCount; i++)
{
BasicBlock* const block = blockVector[i - 1];
BasicBlock* const next = blockVector[i];
assert(BasicBlock::sameEHRegion(block, next));

if (!block->NextIs(next))
{
fgUnlinkBlock(next);
fgInsertBBafter(block, next);
}
}

// Fix call-finally pairs
//
for (int i = 0; hasEH && (i < callFinallyPairs.Height()); i++)
{
const CallFinallyPair& pair = callFinallyPairs.BottomRef(i);

if (!pair.callFinally->NextIs(pair.callFinallyRet))
{
fgUnlinkBlock(pair.callFinallyRet);
fgInsertBBafter(pair.callFinally, pair.callFinallyRet);
}
}
}

//-------------------------------------------------------------
// ehUpdateTryLasts: Iterates EH descriptors, updating each try region's
// end block as determined by getTryLast.
Expand Down
Loading