Skip to content

Commit

Permalink
[LV] Ignore some costs when loop gets fully unrolled (#106699)
Browse files Browse the repository at this point in the history
When VF has a fixed width and equals the number of iterations, and we are not
tail folding by masking, the comparison instruction and the induction operation will be removed later by dead code elimination (DCE).
Ignoring the costs of these instructions improves the cost model.
  • Loading branch information
igogo-x86 authored Dec 9, 2024
1 parent 73adf26 commit 337936a
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 14 deletions.
41 changes: 40 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2682,6 +2682,25 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
return I->second;
}

/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
///
/// The latch compare of \p L, if there is one, is added. For every induction
/// in \p IL, the value incoming from the loop latch (the "next iteration"
/// value) is added only when each of its users is either the induction phi
/// itself or the latch compare.
static void addFullyUnrolledInstructionsToIgnore(
    Loop *L, const LoopVectorizationLegality::InductionList &IL,
    SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
  auto *Cmp = L->getLatchCmpInst();
  if (Cmp)
    InstsToIgnore.insert(Cmp);
  for (const auto &KV : IL) {
    // Extract the key by hand so that it can be used in the lambda below.
    // Capturing a structured binding in a lambda is a C++20 extension and is
    // not valid C++17.
    const PHINode *IV = KV.first;

    // Get next iteration value of the induction variable.
    Instruction *IVInst =
        cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
    // Cmp may be null here; in that case only the phi itself counts as an
    // allowed user.
    if (all_of(IVInst->users(),
               [&](const User *U) { return U == IV || U == Cmp; }))
      InstsToIgnore.insert(IVInst);
  }
}

void InnerLoopVectorizer::createInductionResumeVPValues(
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
SmallPtrSetImpl<PHINode *> *IVSubset) {
Expand Down Expand Up @@ -5592,14 +5611,23 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;

// If the vector loop gets executed exactly once with the given VF, ignore the
// costs of comparison and induction instructions, as they'll get simplified
// away.
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
ValuesToIgnoreForVF);

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
InstructionCost BlockCost;

// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
if (ValuesToIgnore.count(&I) ||
if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;

Expand Down Expand Up @@ -7281,6 +7309,17 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
continue;
IVInsts.push_back(CI);
}

// If the vector loop gets executed exactly once with the given VF, ignore
// the costs of comparison and induction instructions, as they'll get
// simplified away.
// TODO: Remove this code after stepping away from the legacy cost model and
// adding code to simplify VPlans before calculating their costs.
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
CostCtx.SkipCostComputation);

for (Instruction *IVInst : IVInsts) {
if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
continue;
Expand Down
18 changes: 5 additions & 13 deletions llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 26
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 50
; CHECK: LV: Selecting VF: vscale x 2
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body

Expand Down Expand Up @@ -50,9 +48,8 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK: Cost for VF 8: 26
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 50
; CHECK: Cost for VF 16: 49
; CHECK: LV: Selecting VF: vscale x 2
entry:
br label %for.body
Expand Down Expand Up @@ -86,13 +83,10 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 27
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 51
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
Expand Down Expand Up @@ -125,11 +119,9 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 12
; CHECK-NEXT: Cost of 8 for VF 16: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 8 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 16: 20
; CHECK: Cost for VF 16: 4
; CHECK: LV: Selecting VF: 16
entry:
br label %for.body
Expand Down

0 comments on commit 337936a

Please sign in to comment.