-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Remove loop region in optimizeForVFAndUF. #108378
base: main
Are you sure you want to change the base?
[VPlan] Remove loop region in optimizeForVFAndUF. #108378
Conversation
@llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesUpdate optimizeForVFAndUF to completely remove the vector loop region when possible. At the moment, we cannot remove the region if it contains
Both cases can be addressed by more explicit modeling. The patch also includes a number of updates to allow executing VPlans without a vector loop region which can be split off. Patch is 65.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108378.diff 13 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3b6b154b9660cf..81e6d893400f40 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2392,7 +2392,10 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
AC->registerAssumption(II);
// End if-block.
- bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
+ bool IfPredicateInstr =
+ RepRecipe->getParent()->getParent()
+ ? RepRecipe->getParent()->getParent()->isReplicator()
+ : false;
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
@@ -2954,6 +2957,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
+ if (!isa<VPRegionBlock>(State.Plan->getEntry()->getSingleSuccessor()))
+ return;
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
@@ -7598,8 +7603,8 @@ LoopVectorizationPlanner::executePlan(
// 2.5 Collect reduction resume values.
DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
- auto *ExitVPBB =
- cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ auto *ExitVPBB = cast<VPBasicBlock>(
+ BestVPlan.getEntry()->getSingleSuccessor()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
createAndCollectMergePhiForReduction(
dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
@@ -7615,24 +7620,26 @@ LoopVectorizationPlanner::executePlan(
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
- VPBasicBlock *HeaderVPBB =
- BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
- Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
- if (VectorizedLoopID)
- L->setLoopID(*VectorizedLoopID);
- else {
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
+ if (auto *R =
+ dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
+ VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
+ Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
+ if (VectorizedLoopID)
+ L->setLoopID(*VectorizedLoopID);
+ else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
+
+ LoopVectorizeHints Hints(L, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+ addRuntimeUnrollDisableMetaData(L);
}
- TargetTransformInfo::UnrollingPreferences UP;
- TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
- if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
- addRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -9468,7 +9475,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
- assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
+ assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&
+ "IV didn't need transforming?");
State.set(this, DerivedIV, VPIteration(0, 0));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 276e90c7670a44..db7a6c8ea7e1f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -226,8 +226,7 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
InnerLoopVectorizer *ILV, VPlan *Plan,
LLVMContext &Ctx)
: VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr),
- TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
+ LVer(nullptr), TypeAnalysis(IntegerType::get(Ctx, 64), Ctx) {}
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
if (Def->isLiveIn())
@@ -278,8 +277,8 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (SafeToHoist) {
- BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
- Plan->getVectorLoopRegion()->getSinglePredecessor())];
+ BasicBlock *LoopVectorPreHeader =
+ CFG.VPBB2IRBB[cast<VPBasicBlock>(Plan->getEntry())];
if (LoopVectorPreHeader)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
}
@@ -934,7 +933,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
- assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
+ // assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
if (VF.getNumUsers()) {
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
VF.setUnderlyingValue(RuntimeVF);
@@ -1005,8 +1004,13 @@ void VPlan::execute(VPTransformState *State) {
// skeleton creation, so we can only create the VPIRBasicBlocks now during
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ VPBlockBase *Leaf = nullptr;
+ for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry()))
+ if (VPB->getNumSuccessors() == 0) {
+ Leaf = VPB;
+ break;
+ }
+ VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(Leaf->getSinglePredecessor());
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successrs: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1034,64 +1038,66 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
Block->execute(State);
- VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
- BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
-
- // Fix the latch value of canonical, reduction and first-order recurrences
- // phis in the vector loop.
- VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- // Skip phi-like recipes that generate their backedege values themselves.
- if (isa<VPWidenPHIRecipe>(&R))
- continue;
-
- if (isa<VPWidenPointerInductionRecipe>(&R) ||
- isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- PHINode *Phi = nullptr;
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
- } else {
- auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
- assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
- "recipe generating only scalars should have been replaced");
- auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
- Phi = cast<PHINode>(GEP->getPointerOperand());
- }
-
- Phi->setIncomingBlock(1, VectorLatchBB);
+ if (auto *LoopRegion =
+ dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor())) {
+ VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+ BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
+ // Fix the latch value of canonical, reduction and first-order recurrences
+ // phis in the vector loop.
+ VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ // Skip phi-like recipes that generate their backedege values themselves.
+ if (isa<VPWidenPHIRecipe>(&R))
+ continue;
- // Move the last step to the end of the latch block. This ensures
- // consistent placement of all induction updates.
- Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
- continue;
- }
+ if (isa<VPWidenPointerInductionRecipe>(&R) ||
+ isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ PHINode *Phi = nullptr;
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
+ } else {
+ auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+ assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+ "recipe generating only scalars should have been replaced");
+ auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
+ Phi = cast<PHINode>(GEP->getPointerOperand());
+ }
+
+ Phi->setIncomingBlock(1, VectorLatchBB);
+
+ // Move the last step to the end of the latch block. This ensures
+ // consistent placement of all induction updates.
+ Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+ continue;
+ }
- auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
- // For canonical IV, first-order recurrences and in-order reduction phis,
- // only a single part is generated, which provides the last part from the
- // previous iteration. For non-ordered reductions all UF parts are
- // generated.
- bool SinglePartNeeded =
- isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
- bool NeedsScalar =
- isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
- unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
-
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *Phi = State->get(PhiR, Part, NeedsScalar);
- Value *Val =
- State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
- cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+ // For canonical IV, first-order recurrences and in-order reduction phis,
+ // only a single part is generated, which provides the last part from the
+ // previous iteration. For non-ordered reductions all UF parts are
+ // generated.
+ bool SinglePartNeeded =
+ isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar =
+ isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+ unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
+
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *Phi = State->get(PhiR, Part, NeedsScalar);
+ Value *Val =
+ State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ }
}
}
-
State->CFG.DTU.flush();
assert(State->CFG.DTU.getDomTree().verify(
DominatorTree::VerificationLevel::Fast) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 64242e43c56bc8..7b6fb2f31f2d18 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3311,6 +3311,7 @@ class VPRegionBlock : public VPBlockBase {
assert(!isReplicator() && "should only get pre-header of loop regions");
return getSinglePredecessor()->getExitingBasicBlock();
}
+ void clearEntry() { Entry = nullptr; }
/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b722ec34ee6fb6..83726b54fea107 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -702,16 +702,46 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
return;
- LLVMContext &Ctx = SE.getContext();
- auto *BOC =
- new VPInstruction(VPInstruction::BranchOnCond,
- {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
-
SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
+ VPBasicBlock *Header =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+ if (all_of(Header->phis(), [](VPRecipeBase &R) {
+ return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R);
+ })) {
+ for (VPRecipeBase &R : make_early_inc_range(Header->phis())) {
+ auto *P = cast<VPHeaderPHIRecipe>(&R);
+ P->replaceAllUsesWith(P->getStartValue());
+ P->eraseFromParent();
+ }
+
+ VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor();
+ auto HeaderSuccs = to_vector(Header->getSuccessors());
+ VPBasicBlock *Exiting =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+
+ auto *LoopRegion = Plan.getVectorLoopRegion();
+ VPBlockBase *Middle = LoopRegion->getSingleSuccessor();
+ VPBlockUtils::disconnectBlocks(Preheader, LoopRegion);
+ VPBlockUtils::disconnectBlocks(LoopRegion, Middle);
+
+ Header->setParent(nullptr);
+ Exiting->setParent(nullptr);
+ VPBlockUtils::connectBlocks(Preheader, Header);
+
+ VPBlockUtils::connectBlocks(Exiting, Middle);
+ LoopRegion->clearEntry();
+ delete LoopRegion;
+ } else {
+ LLVMContext &Ctx = SE.getContext();
+ auto *BOC =
+ new VPInstruction(VPInstruction::BranchOnCond,
+ {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+ ExitingVPBB->appendRecipe(BOC);
+ }
for (VPValue *Op : PossiblyDead)
recursivelyDeleteDeadRecipes(Op);
- ExitingVPBB->appendRecipe(BOC);
Plan.setVF(BestVF);
Plan.setUF(BestUF);
// TODO: Further simplifications are possible
@@ -720,7 +750,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
}
/// Sink users of \p FOR after the recipe defining the previous value \p
-/// Previous of the recurrence. \returns true if all users of \p FOR could be
+// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index f90524fde79654..1b3dabef5333a6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -81,17 +81,13 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -105,7 +101,7 @@ define void @powi_call(ptr %P) {
; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -236,6 +232,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index ec50b0cac03829..13ebfa82f1da62 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -49,28 +49,25 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0
+; CHECK-NEXT: [[...
[truncated]
|
6d1cc96
to
dddcde5
Compare
This now depends #110004 and contains changes from it |
Use VPInstruction::ResumePhi to create phi nodes for reduction resume values. This allows simplifying createAndCollectMergePhiForReduction to only collect reduction resume phis when vectorizing epilogue loops and adding extra incoming edges from the main vector loop.
dddcde5
to
5f8fabe
Compare
if (VectorizedLoopID) | ||
L->setLoopID(*VectorizedLoopID); | ||
else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (VectorizedLoopID) | |
L->setLoopID(*VectorizedLoopID); | |
else { | |
if (VectorizedLoopID) { | |
L->setLoopID(*VectorizedLoopID); | |
} else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
|
||
VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor(); | ||
auto HeaderSuccs = to_vector(Header->getSuccessors()); | ||
VPBasicBlock *Exiting = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPBasicBlock *Exiting = | |
auto *Exiting = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
SmallVector<VPValue *> PossiblyDead(Term->operands()); | ||
Term->eraseFromParent(); | ||
VPBasicBlock *Header = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPBasicBlock *Header = | |
auto *Header = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
@@ -714,7 +744,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |||
} | |||
|
|||
/// Sink users of \p FOR after the recipe defining the previous value \p | |||
/// Previous of the recurrence. \returns true if all users of \p FOR could be | |||
// Previous of the recurrence. \returns true if all users of \p FOR could be |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Previous of the recurrence. \returns true if all users of \p FOR could be | |
/// Previous of the recurrence. \returns true if all users of \p FOR could be |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed change, thanks!
…ion-instead-of-using-branch-on-cond-true
…ion-instead-of-using-branch-on-cond-true
…ion-instead-of-using-branch-on-cond-true
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rebased after landing #110004
ping :)
if (VectorizedLoopID) | ||
L->setLoopID(*VectorizedLoopID); | ||
else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
SmallVector<VPValue *> PossiblyDead(Term->operands()); | ||
Term->eraseFromParent(); | ||
VPBasicBlock *Header = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
|
||
VPBlockBase *Preheader = Plan.getVectorLoopRegion()->getSinglePredecessor(); | ||
auto HeaderSuccs = to_vector(Header->getSuccessors()); | ||
VPBasicBlock *Exiting = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
@@ -714,7 +744,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |||
} | |||
|
|||
/// Sink users of \p FOR after the recipe defining the previous value \p | |||
/// Previous of the recurrence. \returns true if all users of \p FOR could be | |||
// Previous of the recurrence. \returns true if all users of \p FOR could be |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed change, thanks!
…ion-instead-of-using-branch-on-cond-true
ping :) |
if (isa<VPWidenPointerInductionRecipe>(&R) || | ||
isa<VPWidenIntOrFpInductionRecipe>(&R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (isa<VPWidenPointerInductionRecipe>(&R) || | |
isa<VPWidenIntOrFpInductionRecipe>(&R)) { | |
if (isa<VPWidenPointerInductionRecipe, VPWidenIntOrFpInductionRecipe>(&R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
(isa<VPReductionPHIRecipe>(PhiR) && | ||
cast<VPReductionPHIRecipe>(PhiR)->isInLoop()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replace isa+cast by dyn_cast?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks!
if (all_of(Header->phis(), [](VPRecipeBase &R) { | ||
return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R); | ||
})) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (all_of(Header->phis(), [](VPRecipeBase &R) { | |
return !isa<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>(&R); | |
})) { | |
if (none_of(Header->phis(), IsaPred<VPWidenIntOrFpInductionRecipe, VPReductionPHIRecipe>)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, updated and reordered conditions
LoopRegion->clearEntry(); | ||
delete LoopRegion; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potentially unsafe, may leak if the programmer forgets about it. Maybe, use Destructor or something like RAII?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if there is a nice way to do so currently. The whole replace-and-erase logic in the block needs to happen atomically at the moment. To avoid having to do manual delete's, VPlan could track all blocks added to it and delete them all when the VPlan is deleted (at the moment it only deletes reachable blocks). But that would require some extra logic to make sure all blocks are added properly.
…ion-instead-of-using-branch-on-cond-true
You can test this locally with the following command:git-clang-format --diff 52bbe20eb40f45bc64c614b6b3d7fe13bbacb0ff 1f4febcf652bc940361e25243200f6dfe07fe486 --extensions h,cpp -- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/VPlan.cpp llvm/lib/Transforms/Vectorize/VPlan.h llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp llvm/unittests/Transforms/Vectorize/VPlanTest.cpp llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp View the diff from clang-format here.diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 12efb6c9d5..059372814d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1037,31 +1037,31 @@ void VPlan::execute(VPTransformState *State) {
if (isa<VPWidenPHIRecipe>(&R))
continue;
- if (isa<VPWidenInductionRecipe>(&R)) {
- PHINode *Phi = nullptr;
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
- } else {
- auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
- assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
- "recipe generating only scalars should have been replaced");
- auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
- Phi = cast<PHINode>(GEP->getPointerOperand());
+ if (isa<VPWidenInductionRecipe>(&R)) {
+ PHINode *Phi = nullptr;
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+ } else {
+ auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+ assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+ "recipe generating only scalars should have been replaced");
+ auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+ Phi = cast<PHINode>(GEP->getPointerOperand());
+ }
+
+ Phi->setIncomingBlock(1, VectorLatchBB);
+
+ // Move the last step to the end of the latch block. This ensures
+ // consistent placement of all induction updates.
+ Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+ // Use the steps for the last part as backedge value for the induction.
+ if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+ Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+ continue;
}
- Phi->setIncomingBlock(1, VectorLatchBB);
-
- // Move the last step to the end of the latch block. This ensures
- // consistent placement of all induction updates.
- Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
-
- // Use the steps for the last part as backedge value for the induction.
- if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
- Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
- continue;
- }
-
auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
|
…ion-instead-of-using-branch-on-cond-true
Updated to current |
This patch adds a WideIVStep opcode that can be used to create a vector with the steps to increment a wide induction. The opcode has 3 operands * the vector step * the scale of the vector step * a constant indicating the target type of the VPInstruction (this is working around having explicit types for VPInstructions, we could also introduce a dedicated recipe, at the cost of a lot more scaffolding) The opcode is later converted into a sequence of recipes that convert the scale and step to the target type, if needed, and then multiply vector step by scale. This simplifies code that needs to materialize step vectors, e.g. replacing wide IVs as follow up to llvm#108378 with an increment of the wide IV step.
This patch adds a WideIVStep opcode that can be used to create a vector with the steps to increment a wide induction. The opcode has 3 operands * the vector step * the scale of the vector step * a constant indicating the target type of the VPInstruction (this is working around having explicit types for VPInstructions, we could also introduce a dedicated recipe, at the cost of a lot more scaffolding) The opcode is later converted into a sequence of recipes that convert the scale and step to the target type, if needed, and then multiply vector step by scale. This simplifies code that needs to materialize step vectors, e.g. replacing wide IVs as follow up to llvm#108378 with an increment of the wide IV step.
…ion-instead-of-using-branch-on-cond-true
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
After merging initial skeleton creation, we cannot easily get the vector preheader any longer without the region. Putting this on hold for a bit. |
Unfortunately on latest main we cannot easily identify the vector preheader when constructing the skeleton if the vector loop region is removed :( Need to think if there is a nice solution |
This patch changes the way blocks are managed by VPlan. Previously all blocks reachable from entry would be cleaned up when a VPlan is destroyed. With this patch, each VPlan keeps track of blocks created for it in a list and this list is then used to delete all blocks in the list when the VPlan is destroyed. To do so, block creation is funneled through helpers in directly in VPlan. The main advantage of doing so is it simplifies CFG transformations, as those do not have to take care of deleting any blocks, just adjusting the CFG. This helps to simplify llvm#108378 and llvm#106748. This also simplifies handling of 'immutable' blocks a VPlan holds references to, which at the moment only include the scalar header block. Note that the original constructors taking VPBlockBase are retained at the moment for unit tests.
…ion-instead-of-using-branch-on-cond-true
This patch changes the way blocks are managed by VPlan. Previously all blocks reachable from entry would be cleaned up when a VPlan is destroyed. With this patch, each VPlan keeps track of blocks created for it in a list and this list is then used to delete all blocks in the list when the VPlan is destroyed. To do so, block creation is funneled through helpers in directly in VPlan. The main advantage of doing so is it simplifies CFG transformations, as those do not have to take care of deleting any blocks, just adjusting the CFG. This helps to simplify llvm#108378 and llvm#106748. This also simplifies handling of 'immutable' blocks a VPlan holds references to, which at the moment only include the scalar header block. Note that the original constructors taking VPBlockBase are retained at the moment for unit tests.
…nstead-of-using-branch-on-cond-true
Update optimizeForVFAndUF to completely remove the vector loop region when possible. At the moment, we cannot remove the region if it contains
Both cases can be addressed by more explicit modeling.
The patch also includes a number of updates to allow executing VPlans without a vector loop region which can be split off.
Depends on #110004