Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Interleave loop pre-header with SWP prologue #236

Open
wants to merge 8 commits into
base: aie-public
Choose a base branch
from
2 changes: 2 additions & 0 deletions llvm/lib/CodeGen/MachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,8 @@ void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
unsigned regioninstrs)
{
ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
CurrentTop = MachineBasicBlock::iterator();
CurrentBottom = MachineBasicBlock::iterator();

SchedImpl->initPolicy(begin, end, regioninstrs);

Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
RegionBegin = begin;
RegionEnd = end;
NumRegionInstrs = regioninstrs;

// Ensure all instructions can be given a SUnit without re-allocation.
clearDAG();
SUnits.reserve(NumRegionInstrs);
}

void ScheduleDAGInstrs::exitRegion() {
Expand Down Expand Up @@ -633,8 +637,6 @@ std::optional<unsigned> ScheduleDAGInstrs::initSUnit(MachineInstr &MI) {

void ScheduleDAGInstrs::initSUnits() {
ScheduleDAG::clearDAG();
// Prevent reallocations for performance.
SUnits.reserve(NumRegionInstrs);
// This loop creates SUnits for real instructions.
for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
initSUnit(MI);
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AIE/AIE2InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ include "AIE2GenFixupInstrInfo.td"
include "AIE2MultiSlotPseudoInstrInfo.td"

//Intrinsics
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
let Itinerary = II_EVENT, hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
def EVENT : AIE2_event_inst_alu<(outs), (ins t02u:$val), "event", "$val">;
}
def : Pat<(int_aie2_event t02u:$val), (EVENT t02u:$val)>;
Expand Down Expand Up @@ -500,7 +500,7 @@ let Uses = [SP] in
def PseudoFI : Pseudo<(outs eP:$dst), (ins i32imm:$imm)>;

// NOPs for each VLIW Slot
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isSlotNOP = true in {
let Itinerary = II_NOP, hasSideEffects = 0, mayLoad = 0, mayStore = 0, isSlotNOP = true in {
def NOP : AIE2_nop_nop_inst_nop<(outs), (ins), "nop">;
def NOPA : AIE2_nop_lda_inst_lda<(outs), (ins), "nopa">;
def NOPB : AIE2_nop_ldb_inst_ldb<(outs), (ins), "nopb">;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AIE/AIE2Schedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def II_MOV_SS : InstrItinClass;
def II_MUL : InstrItinClass;
def II_NE : InstrItinClass;
def II_NEZ : InstrItinClass;
def II_NOP : InstrItinClass;
def II_OR : InstrItinClass;
def II_PADD : InstrItinClass;
def II_PADD_2D : InstrItinClass;
Expand Down Expand Up @@ -634,6 +635,7 @@ InstrItinData<II_MOVd6, [SimpleCycle<P_RM_PORT>, EmptyCycles<5>, SimpleCycle<RS_
InstrItinData<II_MUL, [EmptyCycles<1>, SimpleCycle<R_WX_PORT>], [2,1,1]>,
InstrItinData<II_NE, [InstrStage<1, [R_WX_PORT]>], [1,1,1,1]>,
InstrItinData<II_NEZ, [InstrStage<1, [R_WX_PORT]>], [1,1,1]>,
InstrItinData<II_NOP, [], []>,
InstrItinData<II_OR, [InstrStage<1, [R_WX_PORT]>], [1,1,1,1]>,
InstrItinData<II_PADD, [], [1,1,1]>,
InstrItinData<II_PADD_2D, [], [1,1,1,1]>,
Expand Down
22 changes: 2 additions & 20 deletions llvm/lib/Target/AIE/AIEBasePipelinerLoopInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,6 @@

#define DEBUG_TYPE "aie-pipeliner"
namespace llvm {
cl::opt<int> LoopMinTripCount(
"aie-loop-min-tripcount",
cl::desc("Minimum number of loop iterations (warning: applies to all loop"
" pipelining candidates)"),
cl::init(-1), cl::Hidden);
cl::opt<bool> TrackRegPressure(
"aie-pipeliner-track-regpressure",
cl::desc("Refuse SWP schedules likely to run into register spills"),
Expand Down Expand Up @@ -69,13 +64,7 @@ AIEBasePipelinerLoopInfo::AIEBasePipelinerLoopInfo(MachineInstr *EndLoop,
AIELoopUtils::getMinTripCount(*LoopBlock);
if (ParsedMinTripCount) {
MinTripCount = *ParsedMinTripCount;
LLVM_DEBUG(dbgs() << "PLI: MinTripCount from pragma = " << MinTripCount
<< "\n");
}

if (LoopMinTripCount > MinTripCount) {
MinTripCount = LoopMinTripCount;
LLVM_DEBUG(dbgs() << "PLI: MinTripCount from CL option = " << MinTripCount
LLVM_DEBUG(dbgs() << "PLI: MinTripCount from pragma/CL = " << MinTripCount
<< "\n");
}
}
Expand Down Expand Up @@ -682,18 +671,11 @@ class ZeroOverheadLoop : public AIEBasePipelinerLoopInfo {
};

ZeroOverheadLoop::Assessment ZeroOverheadLoop::accept(MachineInstr *EndLoop) {
// We are using LoopMinTripCount below just for testing purposes.
// For MIR test cases without IR, we can't encode loop-related metadata.
if (!MinTripCount && LoopMinTripCount <= 0) {
if (!MinTripCount) {
LLVM_DEBUG(dbgs() << "Unbounded loop detected!\n");
return Assessment::UnboundedLoop;
}

// Overwrite MinTripCount.
if (LoopMinTripCount > 0) {
MinTripCount = LoopMinTripCount;
}

if (MinTripCount <= 1) {
LLVM_DEBUG(dbgs() << "Not interesting MinTripCount (<=1)!\n");
return Assessment::TooLowMinTripCount;
Expand Down
75 changes: 56 additions & 19 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,27 +275,14 @@ class RegionEndEdges : public ScheduleDAGMutation {
assert(EdgeLatency < DelaySlots);
EdgeLatency = DelaySlots + 1;
}

// Between writing Registers (lc, le, ls) and the end of the loop,
// there must be a distance of 112 bytes in terms of PM addresses.
// 112 bytes correspond to 7 fully-expanded 128-bit instructions and
// hence adding a latency of 8 from LoopStart to the ExitSU.
// We can subtract the number of bundles that interblock pushed into
// BottomInsert
// FIXME: this holds as long as we insert them unconditionally. If we
// integrate them with the bottom region, we just need to keep 8 away
// from ExitSU
if (TII->isZeroOverheadLoopSetupInstr(MI)) {
unsigned PatchCycles = 8;
if (DAG->getBB()) {
auto *Scheduler =
static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
auto &InterBlock = Scheduler->getInterBlock();
unsigned InsertedCycles =
InterBlock.getBlockState(DAG->getBB()).BottomInsert.size();
PatchCycles =
PatchCycles >= InsertedCycles ? PatchCycles - InsertedCycles : 0;
}
EdgeLatency = std::max(EdgeLatency, PatchCycles);
const unsigned ZOLDistance = 8;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the future: this magic 8 could be retrieved from TII as well.

EdgeLatency = std::max(EdgeLatency, ZOLDistance);
}

ExitDep.setLatency(EdgeLatency);
Expand All @@ -318,6 +305,56 @@ class RegionEndEdges : public ScheduleDAGMutation {
};
};

/// This Mutator is responsible for emitting "fixed" SUnits at the top or bottom
/// of the region. These special SUnits require a specific cycle and cannot be
/// placed freely by the scheduler.
///
/// Here, these special SUnits get created from Region::top_fixed_instrs() or
/// Region::bot_fixed_instrs(), and dependencies are created between "free" and
/// "fixed" SUnits.
class EmitFixedSUnits : public ScheduleDAGMutation {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could probably add a top-level comment for that mutator

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: for the future, we could have a high-level description of the mutator.

public:
void apply(ScheduleDAGInstrs *DAG) override {
AIEPostRASchedStrategy *Scheduler =
static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);
auto *ItinData = DAG->MF.getSubtarget().getInstrItineraryData();
const BlockState &BS =
Scheduler->getInterBlock().getBlockState(DAG->getBB());
const Region &CurRegion = BS.getCurrentRegion();

// First, create SUnits for all "fixed" instructions
unsigned DistToExitSU = 0;
for (MachineInstr &MI : reverse(CurRegion.bot_fixed_instrs())) {
Scheduler->addFixedSUnit(MI, /*IsTop=*/false, DistToExitSU);
++DistToExitSU;
}
DAG->makeMaps();

// Then, create dependencies between "free" and "fixed" instructions
auto IsFreeSU = [Scheduler](const SUnit &SU) {
return Scheduler->isFreeSU(SU);
};
ArrayRef<AIE::MachineBundle> BotFixedBundles =
CurRegion.getBotFixedBundles();
for (SUnit &FreeSU : make_filter_range(DAG->SUnits, IsFreeSU)) {
const MachineInstr &MI = *FreeSU.getInstr();
MachineInstr *FixedDepMI =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

super nit:

To help readability, we could extend with overloaded cast operators:

 struct InstrAndCycle {
   MachineInstr *MI = nullptr;
   int Cycle;
+  operator MachineInstr *() const { return MI; }
+  operator int() const { return Cycle; }
 };

Then we can just use:

MachineInstr *FixedDepMI = AIE::findEarliestRef(MI, BotFixedBundles, BotFixedBundles.size());

And also:

Earliest = findEarliestRef(MI, TopBundles, Earliest);

AIE::findEarliestRef(MI, BotFixedBundles, BotFixedBundles.size()).MI;
if (!FixedDepMI)
continue;

SUnit *FixedDepSU =
DAG->getSUnit(&*getBundleStart(FixedDepMI->getIterator()));
assert(FixedDepSU && "Fixed Bundle has no corresponding SU.");
SDep Dep(&FreeSU, SDep::Artificial);
Dep.setLatency(
AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true));
FixedDepSU->addPred(Dep, /*Required=*/true);
}
}
};

/// Collect all "weak" edges in a separate vector. This allows modifying
/// \p SU.Preds without invalidating iterators.
SmallVector<SDep, 4> getWeakPreds(SUnit &SU) {
Expand Down Expand Up @@ -543,11 +580,10 @@ class WAWEdges : public ScheduleDAGMutation {
LiveRegs.init(*TRI);
bool AddReservedRegs = true;
if (Scheduler) {
assert(!Scheduler->doMBBSchedRegionsTopDown());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK: this assert is specific to the prologue handling, and the code needs to be extended for epilogue handling.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, WAW handling could run on any region. Here I'm making sure that BS.getBottom() actually represents the bottom region.

MachineBasicBlock *MBB = DAG->getBB();
const BlockState &BS = Scheduler->getInterBlock().getBlockState(MBB);
auto Region = BS.getCurrentRegion();
auto BottomRegion = BS.getBottom();
if (*Region.begin() == *BottomRegion.begin()) {
if (&BS.getCurrentRegion() == &BS.getBottom()) {
Copy link
Collaborator

@martien-de-jong martien-de-jong Nov 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK: this is a relic of the pre-GatheringRegions era

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really, the DAGMutator can run for all types of regions. For non-loop blocks, we do not pre-gather regions.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. Now you have me worried about reallocation of the region array. Pre-gather would imply pre-allocation, making the test 'obviously correct'.

// If the region is bottom region, liveouts of region are same as
// liveouts of the MBB
for (const MCPhysReg Reg : BS.LiveOuts) {
Expand Down Expand Up @@ -665,6 +701,7 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) {
Mutations.emplace_back(std::make_unique<MemoryEdges>());
Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
Mutations.emplace_back(std::make_unique<BiasDepth>());
Mutations.emplace_back(std::make_unique<EmitFixedSUnits>());
gbossu marked this conversation as resolved.
Show resolved Hide resolved
}
return Mutations;
}
Expand Down
30 changes: 28 additions & 2 deletions llvm/lib/Target/AIE/AIEBundle.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@

#include "AIEBaseSubtarget.h"
#include "MCTargetDesc/AIEMCFormats.h"
#include "llvm-c/DebugInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/TargetOpcodes.h"

#include <unordered_map>
Expand Down Expand Up @@ -78,6 +76,10 @@ template <class I> class Bundle {
return true;
}

if (InstOpCode == TargetOpcode::BUNDLE) {
return !BundleRoot;
}

// if we have a standalone bundle, we can't add anything.
if (isStandalone())
return false;
Expand All @@ -103,6 +105,16 @@ template <class I> class Bundle {
MetaInstrs.push_back(Instr);
return;
}

// Keep track of BUNDLE instructions. They need to be cleaned up when
// de-bundling before re-bundling. See applyBundles()
if (Instr->getOpcode() == TargetOpcode::BUNDLE) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

super nit: Instr->isBundle()

assert(!BundleRoot &&
"AIE::Bundle already has a root BUNDLE instruction");
BundleRoot = Instr;
return;
}

// Check if the pre-condition is ensured
assert((!ComputeSlots || !isStandalone()) &&
"Tried to add an instruction in a standalone Bundle");
Expand Down Expand Up @@ -142,6 +154,7 @@ template <class I> class Bundle {
Instrs.clear();
MetaInstrs.clear();
SlotMap.clear();
BundleRoot = nullptr;
}

/// Check if empty
Expand Down Expand Up @@ -192,6 +205,15 @@ template <class I> class Bundle {
return false;
}

/// Remove the root BUNDLE instruction (if one is being tracked) from its
/// parent MBB and stop tracking it.
/// Only the root is erased; the instructions inside the BUNDLE are kept.
void eraseRootFromBlock() {
  // Nothing to do when no root BUNDLE instruction was recorded.
  if (!BundleRoot)
    return;

  BundleRoot->eraseFromBundle();
  BundleRoot = nullptr;
}

bool isNOPBundle() const {
const VLIWFormat *Format = getFormatOrNull();
assert(Format);
Expand Down Expand Up @@ -228,6 +250,10 @@ template <class I> class Bundle {

// Contained meta instructions (These will end up after the bundle)
std::vector<I *> MetaInstrs;

private:
/// A root BUNDLE instruction if it exists.
I *BundleRoot = nullptr;
};

template <class I> bool operator==(const Bundle<I> &B1, const Bundle<I> &B2) {
Expand Down
Loading
Loading