Skip to content

Commit

Permalink
[VPlan] Dispatch to multiple exit blocks via middle blocks. llvm#112138
Browse files Browse the repository at this point in the history
  • Loading branch information
david-arm committed Dec 11, 2024
1 parent 323bedd commit 0807837
Show file tree
Hide file tree
Showing 13 changed files with 609 additions and 80 deletions.
13 changes: 13 additions & 0 deletions llvm/docs/Vectorizers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,19 @@ small trip counts.

.. image:: epilogue-vectorization-cfg.png

Early Exit Vectorization
^^^^^^^^^^^^^^^^^^^^^^^^

When vectorizing a loop with a single early exit, the loop blocks following the
early exit are predicated and the vector loop will always exit via the latch.
If the early exit has been taken, the vector loop's successor block
(``middle.split`` below) branches to the early exit block. Otherwise,
``middle.block`` selects between the exit block reached from the latch and the
scalar remainder loop.

.. image:: vplan-early-exit.png


Performance
-----------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ class LoopVectorizationLegality {

/// Returns the uncountable early exiting block.
BasicBlock *getUncountableEarlyExitingBlock() const {
if (!HasUncountableEarlyExit) {
assert(getUncountableExitingBlocks().empty() &&
"Expected no uncountable exiting blocks");
return nullptr;
}
assert(getUncountableExitingBlocks().size() == 1 &&
"Expected only a single uncountable exiting block");
return getUncountableExitingBlocks()[0];
Expand Down
16 changes: 14 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  // Without an uncountable early exit, defer to the generic LoopAccessInfo
  // query for this block.
  if (!hasUncountableEarlyExit())
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);

  // With an uncountable early exit, only the latch block is reported as
  // needing predication. For now the early exiting block is required to
  // branch directly to the latch; the assertion below checks that.
  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
  assert(
      is_contained(predecessors(LoopLatch),
                   getUncountableEarlyExitingBlock()) &&
      "Uncountable exiting block must be a direct predecessor of latch");
  return BB == LoopLatch;
}

Expand Down Expand Up @@ -1788,13 +1798,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {

HasUncountableEarlyExit = false;
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
HasUncountableEarlyExit = true;
if (!isVectorizableEarlyExitLoop()) {
UncountableExitingBlocks.clear();
HasUncountableEarlyExit = false;
if (DoExtraAnalysis)
Result = false;
else
return false;
} else
HasUncountableEarlyExit = true;
}
}

// Go over each instruction and look at memory deps.
Expand Down
114 changes: 75 additions & 39 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
cl::Hidden,
cl::desc("Try wider VFs if they enable the use of vector variants"));

// Hidden opt-in switch for vectorizing loops that have an uncountable early
// exit. It defaults to off.
static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::Hidden, cl::init(false),
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
Expand Down Expand Up @@ -1382,9 +1387,10 @@ class LoopVectorizationCostModel {
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
// If we might exit from anywhere but the latch and early exit vectorization
// is disabled, we must run the exiting iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
!(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
"from latch block\n");
return true;
Expand Down Expand Up @@ -3656,10 +3662,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {

// Start with the conditional branches exiting the loop. If the branch
// condition is an instruction contained in the loop that is only used by the
// branch, it is uniform.
// branch, it is uniform. Note conditions from uncountable early exits are not
// uniform.
SmallVector<BasicBlock *> Exiting;
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
continue;
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
AddToWorklistIfAllowed(Cmp);
Expand Down Expand Up @@ -8239,8 +8248,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {

// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
if (OrigLoop->isLoopExiting(Src))
// adding uses of an otherwise potentially dead instruction unless we are
// vectorizing a loop with uncountable exits. In that case, we always
// materialize the mask.
if (OrigLoop->isLoopExiting(Src) &&
Src != Legal->getUncountableEarlyExitingBlock())
return EdgeMaskCache[Edge] = SrcMask;

VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
Expand Down Expand Up @@ -8931,50 +8943,58 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
BasicBlock *ExitingBB = find_singleton<BasicBlock>(
to_vector(predecessors(ExitBB)),
[OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
return OrigLoop->contains(Pred) ? Pred : nullptr;
});
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
continue;
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
if (!ExitPhi)
break;
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
})))
continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
if (PredVPBB != MiddleVPBB) {
SmallVector<BasicBlock *> ExitingBlocks;
OrigLoop->getExitingBlocks(ExitingBlocks);
assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
: ExitingBlocks[0];
}
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
}))) {
if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
continue;
}
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
}
}
return ExitUsersToFix;
}

// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
static void
// ExitUsersToFix if needed and their operands are updated. Returns true if all
// exit users can be handled, otherwise return false.
static bool
addUsersInExitBlocks(VPlan &Plan,
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
if (ExitUsersToFix.empty())
return;
return true;

auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
Expand All @@ -8988,14 +9008,18 @@ addUsersInExitBlocks(VPlan &Plan,
if (V->isLiveIn())
continue;

assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
"Exit value not handled yet for this edge.");
// Currently only live-ins can be used by exit values from blocks not
// exiting via the vector latch through to the middle block.
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
return false;

LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
{V, Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Ctx, 32), 1))});
ExitIRI->setOperand(0, Ext);
}
return true;
}

/// Handle users in the exit block for first order reductions in the original
Expand Down Expand Up @@ -9268,11 +9292,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();

if (auto *UncountableExitingBlock =
Legal->getUncountableEarlyExitingBlock()) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlocks(*Plan, ExitUsersToFix);
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
reportVectorizationFailure(
"Some exit values in loop with uncountable exit not supported yet",
"Some exit values in loop with uncountable exit not supported yet",
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
return nullptr;
}

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
Expand Down Expand Up @@ -10138,12 +10174,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}

if (LVL.hasUncountableEarlyExit()) {
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
"early exit is not yet supported",
"early exit is not enabled",
"Auto-vectorization of loops with uncountable "
"early exit is not yet supported",
"UncountableEarlyExitLoopsUnsupported", ORE, L);
"early exit is not enabled",
"UncountableEarlyExitLoopsDisabled", ORE, L);
return false;
}

Expand Down
12 changes: 4 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -861,14 +861,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);

// Create SCEV and VPValue for the trip count.

// Currently only loops with countable exits are vectorized, but calling
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
// uncountable exits whilst also ensuring the symbolic maximum and known
// back-edge taken count remain identical for loops with countable exits.
// We use the symbolic max backedge-taken-count, which works also when
// vectorizing loops with uncountable early exits.
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
"Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
Expand Down Expand Up @@ -903,7 +899,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// 2) If we require a scalar epilogue, there is no conditional branch as
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
Expand Down
15 changes: 13 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,14 @@ class VPBlockBase {
/// Remove all the successors of this block.
void clearSuccessors() { Successors.clear(); }

/// Swap successors of the block. The block must have exactly 2 successors.
// TODO: This should be part of introducing conditional branch recipes rather
// than being independent.
void swapSuccessors() {
  // Callers must only invoke this on a block with exactly two successors.
  assert(Successors.size() == 2 && "must have 2 successors to swap");
  // Exchange the first and second successor in place.
  auto &FirstSucc = Successors[0];
  auto &SecondSucc = Successors[1];
  std::swap(FirstSucc, SecondSucc);
}

/// The method which generates the output IR that corresponds to this
/// VPBlockBase, thereby "executing" the VPlan.
virtual void execute(VPTransformState *State) = 0;
Expand Down Expand Up @@ -1232,6 +1240,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
// Returns a scalar boolean value, which is true if any lane of its single
// operand is true.
AnyOf,
};

private:
Expand Down Expand Up @@ -3884,10 +3895,10 @@ class VPlan {
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch.
const VPBasicBlock *getMiddleBlock() const {
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
VPBasicBlock *getMiddleBlock() {
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}

/// Return the VPBasicBlock for the preheader of the scalar loop.
Expand Down
Loading

0 comments on commit 0807837

Please sign in to comment.