From fbb939a6e3806260974663d600259cccedd218e3 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov@arm.com>
Date: Fri, 15 Nov 2024 14:35:22 +0000
Subject: [PATCH] Addressing suggestions

* Fixing comments
* Adding more tests
* Removing the latch cmp presence requirement
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  42 +++----
 .../AArch64/fully-unrolled-cost.ll            | 113 +++++++++++++++---
 2 files changed, 114 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0a778bd5e6e05a..64a2885aab75cc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2648,29 +2648,21 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
   return I->second;
 }
 
-/// Knowing that loop \p L would be fully unrolled after vectorisation, add
-/// instructions that will get simplified and thus should not have any cost to
-/// \p InstsToIgnore
-static void AddFullyUnrolledInstructionsToIgnore(
+/// Knowing that loop \p L executes a single vector iteration, add to
+/// \p InstsToIgnore the instructions that will get simplified away and thus
+/// should not have any cost.
+static void addFullyUnrolledInstructionsToIgnore(
     Loop *L, const LoopVectorizationLegality::InductionList &IL,
     SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
   auto *Cmp = L->getLatchCmpInst();
-  if (!Cmp)
-    return;
-  InstsToIgnore.insert(Cmp);
+  if (Cmp)
+    InstsToIgnore.insert(Cmp);
   for (const auto &[IV, IndDesc] : IL) {
-    // Get next iteration value of the induction variable
+    // Get next iteration value of the induction variable.
     Instruction *IVInst =
         cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
-    bool IsSimplifiedAway = true;
-    // Check that this value used only to exit the loop
-    for (auto *UIV : IVInst->users()) {
-      if (UIV != IV && UIV != Cmp) {
-        IsSimplifiedAway = false;
-        break;
-      }
-    }
-    if (IsSimplifiedAway)
+    if (all_of(IVInst->users(),
+               [&](const User *U) { return U == IV || U == Cmp; }))
       InstsToIgnore.insert(IVInst);
   }
 }
@@ -5561,12 +5553,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
   InstructionCost Cost;
 
-  // If with the given fixed width VF loop gets fully unrolled, ignore the costs
-  // of comparison and induction instructions, as they'll get simplified away
+  // If the vector loop gets executed exactly once with the given VF, ignore the
+  // costs of comparison and induction instructions, as they'll get simplified
+  // away.
   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   if (VF.isFixed() && TC == VF.getFixedValue())
-    AddFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
+    addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
                                          ValuesToIgnoreForVF);
 
   // For each block.
@@ -7259,11 +7252,14 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
       IVInsts.push_back(CI);
     }
 
-    // If with the given VF loop gets fully unrolled, ignore the costs of
-    // comparison and induction instructions, as they'll get simplified away
+    // If the vector loop gets executed exactly once with the given VF, ignore
+    // the costs of comparison and induction instructions, as they'll get
+    // simplified away.
+    // TODO: Remove this code after stepping away from the legacy cost model and
+    // adding code to simplify VPlans before calculating their costs.
     auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
     if (VF.isFixed() && TC == VF.getFixedValue())
-      AddFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
+      addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                            CostCtx.SkipCostComputation);
 
     for (Instruction *IVInst : IVInsts) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 07a8b1a3ed482e..69cf8b40752c6e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -1,38 +1,115 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
+; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
 
 target triple="aarch64--linux-gnu"
 
+; This test shows that the comparison and the next-iteration IV have zero cost
+; if the vector loop gets executed exactly once with the given VF.
 define i64 @test(ptr %a, ptr %b) #0 {
 ; CHECK: LV: Checking a loop in 'test'
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, 16
-; CHECK: LV: Vector loop of width 8 costs: 3.
-; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, 16
-; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: Cost of 1 for VF 8: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 8: 26
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 16: 48
 ; CHECK: LV: Selecting VF: 16
 entry:
   br label %for.body
 
-for.cond.cleanup:                                 ; preds = %for.body
-  %add.lcssa = phi i64 [ %add, %for.body ]
-  ret i64 %add.lcssa
+exit:                                 ; preds = %for.body
+  ret i64 %add
 
 for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %sum.09 = phi i64 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
   %0 = load i8, ptr %arrayidx, align 1
   %conv = zext i8 %0 to i64
-  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
   %1 = load i8, ptr %arrayidx2, align 1
   %conv3 = zext i8 %1 to i64
   %mul = mul nuw nsw i64 %conv3, %conv
-  %add = add i64 %mul, %sum.09
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 16
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %add = add i64 %mul, %sum
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; Same as above, but the next-iteration IV has extra users, and thus its cost is not zero.
+define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
+; CHECK: LV: Checking a loop in 'test_external_iv_user'
+; CHECK: Cost of 1 for VF 8: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 8: 26
+; CHECK-NEXT: Cost of 1 for VF 16: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 16: 49
+; CHECK: LV: Selecting VF: vscale x 2
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %sum, %mul
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body
+  ret i64 %add
+}
+
+; Same as the first test, but with two IVs that have no extra users. Both next-iteration IVs have zero cost when VF equals the number of iterations.
+define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
+; CHECK: LV: Checking a loop in 'test_two_ivs'
+; CHECK: Cost of 1 for VF 8: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: induction instruction   %j.iv.next = add nuw nsw i64 %j.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction   %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 8: 27
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 16: 48
+; CHECK: LV: Selecting VF: 16
+entry:
+  br label %for.body
+
+exit:                                 ; preds = %for.body
+  ret i64 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %j.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %mul, %sum
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %j.iv.next = add nuw nsw i64 %j.iv, 1
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
 }
 
 attributes #0 = { vscale_range(1, 16) "target-features"="+sve" }