From fbb939a6e3806260974663d600259cccedd218e3 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Fri, 15 Nov 2024 14:35:22 +0000 Subject: [PATCH] Addressing suggestions * Fixing comments * Adding more tests * Remove cmp latch presence requirements --- .../Transforms/Vectorize/LoopVectorize.cpp | 42 +++---- .../AArch64/fully-unrolled-cost.ll | 113 +++++++++++++++--- 2 files changed, 114 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0a778bd5e6e05a..64a2885aab75cc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2648,29 +2648,21 @@ static Value *getExpandedStep(const InductionDescriptor &ID, return I->second; } -/// Knowing that loop \p L would be fully unrolled after vectorisation, add -/// instructions that will get simplified and thus should not have any cost to -/// \p InstsToIgnore -static void AddFullyUnrolledInstructionsToIgnore( +/// Knowing that loop \p L executes a single vector iteration, add instructions +/// that will get simplified and thus should not have any cost to \p +/// InstsToIgnore. +static void addFullyUnrolledInstructionsToIgnore( Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl<Instruction *> &InstsToIgnore) { auto *Cmp = L->getLatchCmpInst(); - if (!Cmp) - return; - InstsToIgnore.insert(Cmp); + if (Cmp) + InstsToIgnore.insert(Cmp); for (const auto &[IV, IndDesc] : IL) { - // Get next iteration value of the induction variable + // Get next iteration value of the induction variable.
Instruction *IVInst = cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); - bool IsSimplifiedAway = true; - // Check that this value used only to exit the loop - for (auto *UIV : IVInst->users()) { - if (UIV != IV && UIV != Cmp) { - IsSimplifiedAway = false; - break; - } - } - if (IsSimplifiedAway) + if (all_of(IVInst->users(), + [&](const User *U) { return U == IV || U == Cmp; })) InstsToIgnore.insert(IVInst); } } @@ -5561,12 +5553,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { InstructionCost Cost; - // If with the given fixed width VF loop gets fully unrolled, ignore the costs - // of comparison and induction instructions, as they'll get simplified away + // If the vector loop gets executed exactly once with the given VF, ignore the + // costs of comparison and induction instructions, as they'll get simplified + // away. SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (VF.isFixed() && TC == VF.getFixedValue()) - AddFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), + addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), ValuesToIgnoreForVF); // For each block. @@ -7259,11 +7252,14 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, IVInsts.push_back(CI); } - // If with the given VF loop gets fully unrolled, ignore the costs of - // comparison and induction instructions, as they'll get simplified away + // If the vector loop gets executed exactly once with the given VF, ignore + // the costs of comparison and induction instructions, as they'll get + // simplified away. + // TODO: Remove this code after stepping away from the legacy cost model and + // adding code to simplify VPlans before calculating their costs.
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop); if (VF.isFixed() && TC == VF.getFixedValue()) - AddFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), + addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), CostCtx.SkipCostComputation); for (Instruction *IVInst : IVInsts) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 07a8b1a3ed482e..69cf8b40752c6e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -1,38 +1,115 @@ ; REQUIRES: asserts -; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s +; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s target triple="aarch64--linux-gnu" +; This test shows that comparison and next iteration IV have zero cost if the +; vector loop gets executed exactly once with the given VF. define i64 @test(ptr %a, ptr %b) #0 { ; CHECK: LV: Checking a loop in 'test' -; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16 -; CHECK: LV: Vector loop of width 8 costs: 3. -; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction: %exitcond.not = icmp eq i64 %indvars.iv.next, 16 -; CHECK: LV: Vector loop of width 16 costs: 3. 
+; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 8: 26 +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 16: 48 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body -for.cond.cleanup: ; preds = %for.body - %add.lcssa = phi i64 [ %add, %for.body ] - ret i64 %add.lcssa +exit: ; preds = %for.body + ret i64 %add for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %sum.09 = phi i64 [ 0, %entry ], [ %add, %for.body ] - %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv %0 = load i8, ptr %arrayidx, align 1 %conv = zext i8 %0 to i64 - %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum.09 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 16 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + %add = add i64 %mul, %sum + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body +} + +; Same as above, but in the next iteration IV has extra users, and thus, 
the cost is not zero. +define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { +; CHECK: LV: Checking a loop in 'test_external_iv_user' +; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 8: 26 +; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 16: 49 +; CHECK: LV: Selecting VF: vscale x 2 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %sum, %mul + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i64 %add +} + +; Same as above but with two IVs without extra users. They all have zero cost when VF equals the number of iterations. 
+define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { +; CHECK: LV: Checking a loop in 'test_two_ivs' +; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 8: 27 +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 16: 48 +; CHECK: LV: Selecting VF: 16 +entry: + br label %for.body + +exit: ; preds = %for.body + ret i64 %add + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %j.iv + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %mul, %sum + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %j.iv.next = add nuw nsw i64 %j.iv, 1 + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body } attributes #0 = { vscale_range(1, 16) "target-features"="+sve" }