[VPlan] Dispatch to multiple exit blocks via middle blocks. #112138

fhahn · 2024-10-13T13:53:47Z

A more lightweight variant of #109193,
which dispatches to multiple exit blocks via the middle blocks.

The patch also introduces a bit of required scaffolding to enable early-exit vectorization, including an option. At the moment, early-exit vectorization doesn't come with legality checks, and is only used if the option is provided and the loop has metadata forcing vectorization. This is only intended to be used for testing during bring-up, with @david-arm enabling automatic early-exit vectorization by plugging in the changes from #88385.

github-actions · 2024-10-22T02:14:42Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvmbot · 2024-10-22T02:16:21Z

@llvm/pr-subscribers-vectorizers

Author: Florian Hahn (fhahn)

Changes

A more lightweight variant of #109193,
which dispatches to multiple exit blocks via the middle blocks.

The patch also introduces a bit of required scaffolding to enable early-exit vectorization, including an option. At the moment, early-exit vectorization doesn't come with legality checks, and is only used if the option is provided and the loop has metadata forcing vectorization. This is only intended to be used for testing during bring-up, with @david-arm enabling automatic early-exit vectorization by plugging in the changes from #88385.

Patch is 48.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112138.diff

12 Files Affected:

(modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+3)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+29)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+51-30)
(modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+36-8)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+1)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+20-5)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+82)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4)
(modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (-8)
(added) llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll (+240)
(modified) llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll (+9-9)
(added) llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll (+148)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..af6fae44cf0f09 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,9 @@ class LoopVectorizationLegality {
   /// we can use in-order reductions.
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
+  /// Returns true if the loop has an early exit that we can vectorize.
+  bool canVectorizeEarlyExit() const;
+
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking.
   bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ee53d28a4c8282 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                        cl::desc("Enable recognition of non-constant strided "
                                 "pointer induction variables."));
 
+static cl::opt<bool>
+    EnableEarlyExitVectorization("enable-early-exit-vectorization",
+                                 cl::init(false), cl::Hidden, cl::desc(""));
+
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+  // When vectorizing early exits, create predicates for all blocks, except the
+  // header.
+  if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+    return true;
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   return true;
 }
 
+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
+  // Currently only allow vectorizing loops with early exits, if early-exit
+  // vectorization is explicitly enabled and the loop has metadata to force
+  // vectorization.
+  if (!EnableEarlyExitVectorization)
+    return false;
+
+  SmallVector<BasicBlock *> Exiting;
+  TheLoop->getExitingBlocks(Exiting);
+  if (Exiting.size() == 1)
+    return false;
+
+  LoopVectorizeHints Hints(TheLoop, true, *ORE);
+  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+    return false;
+
+  Function *Fn = TheLoop->getHeader()->getParent();
+  return Hints.allowVectorization(Fn, TheLoop,
+                                  true /*VectorizeOnlyWhenForced*/);
+}
+
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..c80d45b1479b36 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel {
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
+      if (!Legal->canVectorizeEarlyExit()) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
     }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
-  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
+          Legal->canVectorizeEarlyExit()) &&
          "multiple exit loop without required epilogue?");
 
   LoopMiddleBlock =
@@ -2758,7 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
 
-  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
 
   DenseMap<Value *, Value *> MissingVals;
 
@@ -2819,6 +2821,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
   }
+
+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+         "Expected a single exit block");
 }
 
 namespace {
@@ -3599,7 +3604,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   TheLoop->getExitingBlocks(Exiting);
   for (BasicBlock *E : Exiting) {
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+        (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
       AddToWorklistIfAllowed(Cmp);
   }
 
@@ -7692,12 +7698,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  auto *ExitVPBB =
-      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+    ExitVPBB = cast<VPBasicBlock>(
+        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+    for (VPRecipeBase &R : *ExitVPBB) {
+      createAndCollectMergePhiForReduction(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
   }
 
   // 2.6. Maintain Loop Hints
@@ -7723,6 +7732,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
+
   TargetTransformInfo::UnrollingPreferences UP;
   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7735,15 +7745,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (ExitVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -8128,7 +8140,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   // If source is an exiting block, we know the exit edge is dynamically dead
   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
   // adding uses of an otherwise potentially dead instruction.
-  if (OrigLoop->isLoopExiting(Src))
+  if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
     return EdgeMaskCache[Edge] = SrcMask;
 
   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8778,6 +8790,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+    return {};
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8863,6 +8877,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   // TODO: Should be replaced by
   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
   // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())
+    return;
   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
   VPBasicBlock *ScalarPHVPBB = nullptr;
   if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9146,10 +9162,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
-  addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-  addUsersInExitBlock(*Plan, ExitUsersToFix);
+  if (Legal->canVectorizeEarlyExit()) {
+    VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+                                        RecipeBuilder);
+  } else {
+    SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+        OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+    addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+    addUsersInExitBlock(*Plan, ExitUsersToFix);
+  }
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9277,8 +9298,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   using namespace VPlanPatternMatch;
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9297,8 +9316,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       for (VPUser *U : Cur->users()) {
         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
-          assert(UserRecipe->getParent() == MiddleVPBB &&
-                 "U must be either in the loop region or the middle block.");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9403,6 +9420,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   }
   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
   Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())
+    return;
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
   for (VPRecipeBase &R :
        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..eb7c808551340d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
     // backedges. A backward successor is set when the branch is created.
     const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&
+        PredVPBlock == getPlan()->getVectorLoopRegion() &&
+        PredVPBlock->getNumSuccessors()) {
+      // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
+      PredBB = TermBr->getSuccessor(1);
+      TermBr = cast<BranchInst>(PredBB->getTerminator());
+      idx = 0;
+    }
     assert(!TermBr->getSuccessor(idx) &&
            "Trying to reset an existing successor block.");
     TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   if (!RequiresScalarEpilogueCheck) {
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
     return Plan;
   }
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
-  // The connection order corresponds to the operands of the conditional branch.
-  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
-  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  if (IRExitBlock) {
+    auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+    // The connection order corresponds to the operands of the conditional
+    // branch.
+    VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+    VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  }
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
   // Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1
+          ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+          : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successrs: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) {
       MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
   assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
          "scalar preheader cannot be wrapped already");
+  if (ScalarPhVPBB->getNumSuccessors() != 0) {
+    ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+    MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+  }
   replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
   replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
 
@@ -1056,12 +1074,19 @@ void VPlan::execute(VPTransformState *State) {
   State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
 
   // Generate code in the loop pre-header and body.
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+
+  for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
+  if (!getVectorLoopRegion()->getSingleSuccessor())
+    VectorLatchBB =
+        cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1088,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+        Inc->moveBefore(VectorLatchBB->getTerminator());
+      else
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
 
       // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..21f44eac188936 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    AnyOf,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..9d5c609ad26043 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
     default:
       return true;
     }
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPScalarCastSC:
     return false;
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
@@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::AnyOf:
     return true;
   default:
     return false;
@@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
+  case VPInstruction::AnyOf: {
+    Value *A = State.get(getOperand(0));
+    return Builder.CreateOrReduce(A);
+  }
 
   default:
     llvm_unreachable("Unsupported opcode for instruction");
@@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::AnyOf;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return false;
   case Instruction::ICmp:
   case Instruction::Select:
+  case Instruction::Or:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
@@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::AnyOf...
[truncated]

llvmbot · 2024-10-22T02:16:21Z

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

A more lightweight variant of #109193,
which dispatches to multiple exit blocks via the middle blocks.

The patch also introduces a bit of required scaffolding to enable early-exit vectorization, including an option. At the moment, early-exit vectorization doesn't come with legality checks, and is only used if the option is provided and the loop has metadata forcing vectorization. This is only intended to be used for testing during bring-up, with @david-arm enabling automatic early-exit vectorization by plugging in the changes from #88385.

Patch is 48.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112138.diff

12 Files Affected:

(modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+3)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+29)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+51-30)
(modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+36-8)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+1)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+20-5)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+82)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4)
(modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (-8)
(added) llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll (+240)
(modified) llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll (+9-9)
(added) llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll (+148)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..af6fae44cf0f09 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,9 @@ class LoopVectorizationLegality {
   /// we can use in-order reductions.
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
+  /// Returns true if the loop has an early exit that we can vectorize.
+  bool canVectorizeEarlyExit() const;
+
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking.
   bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ee53d28a4c8282 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                        cl::desc("Enable recognition of non-constant strided "
                                 "pointer induction variables."));
 
+static cl::opt<bool>
+    EnableEarlyExitVectorization("enable-early-exit-vectorization",
+                                 cl::init(false), cl::Hidden, cl::desc(""));
+
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+  // When vectorizing early exits, create predicates for all blocks, except the
+  // header.
+  if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+    return true;
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   return true;
 }
 
+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
+  // Currently only allow vectorizing loops with early exits, if early-exit
+  // vectorization is explicitly enabled and the loop has metadata to force
+  // vectorization.
+  if (!EnableEarlyExitVectorization)
+    return false;
+
+  SmallVector<BasicBlock *> Exiting;
+  TheLoop->getExitingBlocks(Exiting);
+  if (Exiting.size() == 1)
+    return false;
+
+  LoopVectorizeHints Hints(TheLoop, true, *ORE);
+  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+    return false;
+
+  Function *Fn = TheLoop->getHeader()->getParent();
+  return Hints.allowVectorization(Fn, TheLoop,
+                                  true /*VectorizeOnlyWhenForced*/);
+}
+
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..c80d45b1479b36 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel {
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
+      if (!Legal->canVectorizeEarlyExit()) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
     }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
-  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
+          Legal->canVectorizeEarlyExit()) &&
          "multiple exit loop without required epilogue?");
 
   LoopMiddleBlock =
@@ -2758,7 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
 
-  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
 
   DenseMap<Value *, Value *> MissingVals;
 
@@ -2819,6 +2821,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
   }
+
+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+         "Expected a single exit block");
 }
 
 namespace {
@@ -3599,7 +3604,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   TheLoop->getExitingBlocks(Exiting);
   for (BasicBlock *E : Exiting) {
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+        (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
       AddToWorklistIfAllowed(Cmp);
   }
 
@@ -7692,12 +7698,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  auto *ExitVPBB =
-      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+    ExitVPBB = cast<VPBasicBlock>(
+        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+    for (VPRecipeBase &R : *ExitVPBB) {
+      createAndCollectMergePhiForReduction(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
   }
 
   // 2.6. Maintain Loop Hints
@@ -7723,6 +7732,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
+
   TargetTransformInfo::UnrollingPreferences UP;
   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7735,15 +7745,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (ExitVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -8128,7 +8140,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   // If source is an exiting block, we know the exit edge is dynamically dead
   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
   // adding uses of an otherwise potentially dead instruction.
-  if (OrigLoop->isLoopExiting(Src))
+  if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
     return EdgeMaskCache[Edge] = SrcMask;
 
   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8778,6 +8790,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+    return {};
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8863,6 +8877,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   // TODO: Should be replaced by
   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
   // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())
+    return;
   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
   VPBasicBlock *ScalarPHVPBB = nullptr;
   if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9146,10 +9162,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
-  addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-  addUsersInExitBlock(*Plan, ExitUsersToFix);
+  if (Legal->canVectorizeEarlyExit()) {
+    VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+                                        RecipeBuilder);
+  } else {
+    SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+        OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+    addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+    addUsersInExitBlock(*Plan, ExitUsersToFix);
+  }
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9277,8 +9298,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   using namespace VPlanPatternMatch;
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9297,8 +9316,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       for (VPUser *U : Cur->users()) {
         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
-          assert(UserRecipe->getParent() == MiddleVPBB &&
-                 "U must be either in the loop region or the middle block.");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9403,6 +9420,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   }
   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
   Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())
+    return;
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
   for (VPRecipeBase &R :
        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..eb7c808551340d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
     // backedges. A backward successor is set when the branch is created.
     const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&
+        PredVPBlock == getPlan()->getVectorLoopRegion() &&
+        PredVPBlock->getNumSuccessors()) {
+      // Update PRedBB and TermBr for BranchOnMultiCond in predecessor.
+      PredBB = TermBr->getSuccessor(1);
+      TermBr = cast<BranchInst>(PredBB->getTerminator());
+      idx = 0;
+    }
     assert(!TermBr->getSuccessor(idx) &&
            "Trying to reset an existing successor block.");
     TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   if (!RequiresScalarEpilogueCheck) {
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
     return Plan;
   }
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
-  // The connection order corresponds to the operands of the conditional branch.
-  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
-  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  if (IRExitBlock) {
+    auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+    // The connection order corresponds to the operands of the conditional
+    // branch.
+    VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+    VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  }
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
   // Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1
+          ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+          : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successrs: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) {
       MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
   assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
          "scalar preheader cannot be wrapped already");
+  if (ScalarPhVPBB->getNumSuccessors() != 0) {
+    ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+    MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+  }
   replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
   replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
 
@@ -1056,12 +1074,19 @@ void VPlan::execute(VPTransformState *State) {
   State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
 
   // Generate code in the loop pre-header and body.
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+
+  for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
+  if (!getVectorLoopRegion()->getSingleSuccessor())
+    VectorLatchBB =
+        cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1088,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+        Inc->moveBefore(VectorLatchBB->getTerminator());
+      else
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
 
       // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..21f44eac188936 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    AnyOf,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..9d5c609ad26043 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
     default:
       return true;
     }
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPScalarCastSC:
     return false;
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
@@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::AnyOf:
     return true;
   default:
     return false;
@@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
+  case VPInstruction::AnyOf: {
+    Value *A = State.get(getOperand(0));
+    return Builder.CreateOrReduce(A);
+  }
 
   default:
     llvm_unreachable("Unsupported opcode for instruction");
@@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::AnyOf;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return false;
   case Instruction::ICmp:
   case Instruction::Select:
+  case Instruction::Or:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
@@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::AnyOf...
[truncated]

A more lightweight variant of llvm#109193, which dispatches to multiple exit blocks via the middle blocks.

david-arm

Thanks for this! I realise you're busy with the LLVM dev conference, but thought I'd leave a few comments I have so far ...

david-arm · 2024-10-22T08:12:48Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
-  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||


I think we can completely remove the need for the LoopExitBlock variable - see #108231, since it's only ever used in one place. And even then what's actually meant by LoopExitBlock in createEpilogueVectorizedLoopSkeleton is the exit block from the latch.

Then in #88385 I think we can replace this assert with:

assert((OrigLoop->getUniqueLatchExitBlock() || Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?");

because even if canVectorizeEarlyExit returns true I think we still require an exit from the latch.

Sounds good!

david-arm · 2024-10-22T08:14:38Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
+      if (!Legal->canVectorizeEarlyExit()) {


I think you can fold the conditions together into a single if statement:

if (!Legal->canVectorizeEarlyExit() && TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { ... }

folded thanks!

david-arm · 2024-10-22T08:19:30Z

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                       cl::desc("Enable recognition of non-constant strided "
                                "pointer induction variables."));

+static cl::opt<bool>


Do we need a description here, such as what I had in https://github.com/llvm/llvm-project/pull/88385/files for the same flag? Or is the idea to try to not expose this too much at this stage?

Updated to move it to LoopVectorize.cpp, thanks. Originally this was only used in combination with the new helper introduced to LVL, but that changed after using the existing checks.

Hmm, the flag still seems to be in the old place. Perhaps the patch hasn't updated correctly?

david-arm · 2024-10-22T08:24:11Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
+
+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&


Perhaps the assert would be more accurate with the statement Expected a single exit block for escaping values?

Also, might be worth moving the assert to before the for (auto &I : MissingVals) line because that's the point at which we start adjusting the original scalar code?

Updated, thanks!

david-arm · 2024-10-22T08:26:45Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {


Could rewrite this as:

if (auto *LoopSucc = BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) { ExitVPBB = cast<VPBasicBlock>(LoopSucc); ...

In fact there can only be one successor in this patch.

Yep, something left over from earlier versions. Removed, thanks!

david-arm · 2024-10-22T10:24:44Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+    BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+    BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+    VPIRBasicBlock *VPExitBlock;
+    if (OrigLoop->getUniqueExitBlock())


Again, this doesn't seem to interact well with loops that have countable early exits, which we already support vectorising. I will be adding support for loops with a mixture of countable and uncountable early exits (#88385), which will require a scalar epilogue for the final iteration (assuming we didn't leave via the uncountable exit). I imagine that's more efficient than testing the early countable exit in each vector iteration.

For the purposes of adding initial support for multiple loop region successors, you could also potentially simplify the algorithm by requiring canVectorizeEarlyExit to only permit one early exit? Or if you prefer to add support for multiple early exits then it makes sense to add tests for them. I think at the moment the tests only have a single, countable early exit.

Hmm, I think I understand what you're trying to do with this a bit more now. Essentially, this patch is trying to take an existing supported vectorisation case (early countable exits) and apply a new vectorisation style. This permits you to test the new code you're adding without having to worry about the legality of vectorising loops with uncountable exits. I guess it's just the new flag I'm worried about since it conflicts with my existing PR #88385. It's almost like you want a flag to control the flavour of vectorisation for an already supported case, i.e. something like -prefer-multi-exit.

david-arm · 2024-10-22T10:37:16Z

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  return true;
 }

+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {


It feels a little odd to have both canVectorizeEarlyExit and isVectorizableEarlyExitLoop in the same class. Would it make sense to move the hints check into isVectorizableEarlyExitLoop to bypass legality checks? You could even get the benefit of existing code to build up a list of countable and uncountable exits that can be used later on when splitting the middle.block.

Originally I tried to limit the scope to just support vectorizing loops with multiple countable exits, but this probably made things a bit more complicated for not too much gain. Updated to use the existing isVectorizableEarlyExitLoop

I removed the new codegen tests (only kept the VPlan version). Are there any existing tests already for which adding the flag would be sufficient now that this is using the existing checks?

I think you can also enable early exit autovec with Transforms/LoopVectorize/single_early_exit.ll

david-arm · 2024-10-22T10:39:44Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+      BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+  VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+  Plan.getTripCount()->replaceAllUsesWith(NewTC);
+  Plan.resetTripCount(NewTC);


I don't think you need this code - the existing trip count already uses the predicated symbolic max backedge taken count.

The original version only handled loops with multiple countable exits, which is why this code is needed. Updated now to just handle ones with uncountable exits by using canVectorizeEarlyExit as suggested, removed the code here

david-arm · 2024-10-22T10:49:38Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+        break;
+      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+      VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+      ExitIRI->addOperand(V);


I think this is ok for now, but ultimately it will need handling in the same way as collectUsersInExitBlock. I have a downstream implementation of #88385 that avoids using VPLiveOut, etc. and refactors collectUsersInExitBlock to work for early exit blocks too. I'll put a patch up soon!

Does the ExitIRI need an ExtractFromEnd if the vp value is not LiveIn?

Updated to use the generic logic, based on #115066

david-arm · 2024-10-22T10:55:35Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups(
      }
  }
 }
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,


It's not immediately obvious from the name what this means and is used for. I wonder if it's better named as something like handleUncountableEarlyExit to reflect what it will actually be used for? Or perhaps handleMultipleRegionSuccessors?

Updated, thanks!

david-arm

Sorry @fhahn I left a few more comments for you to look at when you get back from the conference!

david-arm · 2024-10-23T15:09:34Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {


In fact there can only be one successor in this patch.

david-arm · 2024-10-23T15:09:58Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())


There will only ever be one successor.

Removed, thanks!

david-arm · 2024-10-23T15:10:12Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences(
  // TODO: Should be replaced by
  // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
  // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())


Only one successor permitted. Can delete this I think?

Done, thanks!

david-arm · 2024-10-23T15:10:36Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
  }
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())


Again, only one successor.

Removed, thanks!

david-arm · 2024-10-23T15:11:02Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
    // backedges. A backward successor is set when the branch is created.
    const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
    unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&


I think this code was for a different version of the patch using BranchOnMultiCond - can be deleted I think?

Indeed, removed, thanks!

david-arm · 2024-10-23T15:11:46Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
  // VPlan execution rather than earlier during VPlan construction.
  BasicBlock *MiddleBB = State->CFG.ExitBB;
  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1


One successor only.

Removed, thanks!

david-arm · 2024-10-23T15:11:59Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

    Block->execute(State);

  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

+  if (!getVectorLoopRegion()->getSingleSuccessor())


One successor only.

Removed, thanks!

This work is in preparation for PRs llvm#112138 and llvm#88385 where the middle block is not guaranteed to be the immediate successor to the region block. I've simply added new getMiddleBlock() interfaces to VPlan that for now just return cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()). Once PR llvm#112138 lands we'll need to do more work to discover the middle block.

fhahn · 2024-10-28T09:29:32Z

Sorry @fhahn I left a few more comments for you to look at when you get back from the conference!

Great thanks, will get to them shortly, more to clean up after moving from #109193

This work is in preparation for PRs llvm#112138 and llvm#88385 where the middle block is not guaranteed to be the immediate successor to the region block. I've simply added new getMiddleBlock() interfaces to VPlan that for now just return cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()). Once PR llvm#112138 lands we'll need to do more work to discover the middle block.

…-cond-in-middle

fhahn

Addressed a first batch of comments, more to follow soon!

fhahn · 2024-10-31T17:33:40Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {


Yep, something left over from earlier versions. Removed, thanks!

fhahn · 2024-10-31T17:33:52Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
    Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
    const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())


Removed, thanks!

fhahn · 2024-10-31T17:34:11Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences(
  // TODO: Should be replaced by
  // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
  // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())


Done, thanks!

fhahn · 2024-10-31T17:34:32Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
  }
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())


Removed, thanks!

fhahn · 2024-10-31T17:35:24Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
    // backedges. A backward successor is set when the branch is created.
    const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
    unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&


Indeed, removed, thanks!

fhahn · 2024-10-31T17:36:11Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
  // VPlan execution rather than earlier during VPlan construction.
  BasicBlock *MiddleBB = State->CFG.ExitBB;
  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1


Removed, thanks!

fhahn · 2024-10-31T17:36:40Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

    Block->execute(State);

  VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
  BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

+  if (!getVectorLoopRegion()->getSingleSuccessor())


Removed, thanks!

This work is in preparation for PRs llvm#112138 and llvm#88385 where the middle block is not guaranteed to be the immediate successor to the region block. I've simply added new getMiddleBlock() interfaces to VPlan that for now just return cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()). Once PR llvm#112138 lands we'll need to do more work to discover the middle block.

This work is in preparation for PRs #112138 and #88385 where the middle block is not guaranteed to be the immediate successor to the region block. I've simply added new getMiddleBlock() interfaces to VPlan that for now just return cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()). Once PR #112138 lands we'll need to do more work to discover the middle block.

david-arm · 2024-11-01T15:07:29Z

Hi @fhahn, just for reference here is a branch that contains the changes in this PR as a single initial commit, followed by 3 more commits to add support for early exit auto-vectorisation. It's just a reworking of my original PR #88385, based off this new approach.

https://github.com/david-arm/llvm-project/tree/ee_autovec2

Once this PR lands I'll update PR #88385 accordingly. Hopefully the branch will give you an indication of what direction I'm moving towards and might help to make sense of the comments I left on this PR too!

This work is in preparation for PRs llvm#112138 and llvm#88385 where the middle block is not guaranteed to be the immediate successor to the region block. I've simply added new getMiddleBlock() interfaces to VPlan that for now just return cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()). Once PR llvm#112138 lands we'll need to do more work to discover the middle block.

fhahn

Should the block diagram of https://llvm.org/docs/Vectorizers.html#epilogue-vectorization be updated?

Added a brief section with a diagram of the VPlan for loops with early exits, thanks

fhahn · 2024-12-10T11:45:35Z

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

@@ -1375,6 +1375,17 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }

 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+  // When vectorizing early exits, create predicates for the latch block. The


Updated, thanks!

fhahn · 2024-12-10T11:50:25Z

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

+  if (hasUncountableEarlyExit()) {
+    assert(
+        getUncountableExitingBlocks().size() == 1 &&


Updated to use the existing getUncountableEarlyExitingBlock which I missed originally, thanks

fhahn · 2024-12-10T11:50:47Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
-         "multiple exit loop without required epilogue?");
+         "loops not exiting via the latch without required epilogue?");


Split off to 56ddbef

fhahn · 2024-12-10T11:50:55Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+         "Expected a single exit block for escaping values");


Split off to 0e7f187

fhahn · 2024-12-10T11:57:07Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

  TheLoop->getExitingBlocks(Exiting);
  for (BasicBlock *E : Exiting) {
    auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+        (TheLoop->getLoopLatch() == E || !Legal->hasUncountableEarlyExit()))


Done, also moved the check to an early continue to make this easier to read, thanks

fhahn · 2024-12-10T13:38:17Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

@@ -124,6 +124,17 @@ struct VPlanTransforms {
  /// Remove dead recipes from \p Plan.
  static void removeDeadRecipes(VPlan &Plan);

+  /// Update \p Plan to account for uncountable exit blocks in \p
+  /// UncountableExitingBlocks by
+  ///  * updating the condition to exit the vector loop to include the early


Updated thanks

fhahn · 2024-12-10T13:38:28Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

+  ///  * updating the condition to exit the vector loop to include the early
+  ///    exit conditions
+  ///  * splitting the original middle block to branch to the early exit blocks
+  ///    if taken. Returns false if the transformation wasn't successful.


Updated, thanks

fhahn · 2024-12-10T13:38:48Z

llvm/test/Transforms/LoopVectorize/single_early_exit.ll

+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true)


Yes, can fold in VPlan simplification, as follow-up or included here?

fhahn · 2024-12-10T13:42:35Z

llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll

+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch


This should be covered in single_early_exit.ll

fhahn · 2024-12-10T13:42:53Z

llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll

+
+define i64 @multi_exiting_to_different_exits_load_exit_value() {
+; CHECK: multi_exiting_to_different_exits_load_exit_value
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {


Removed here, covered in the non-vplan tests.

ayalz · 2024-12-10T17:48:15Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  // 4. Adjust branch weight of the branch in the middle block if it exists.


Thanks for adding "if it exists". Comment also meant to question when would the middle block not exist? Should this refer to cases where the middle block (which conceptually is always there, after vector loop and whatever comes next (exit(s) and/or scalar loop)), rather than being a single block (that ends with a conditional or unconditional branch), is split into multiple blocks as it has more than two successors (rather than targeting them all with a switch)?
A null ExitVPBB, OTOH, corresponds to a scalar preheader having more than one predecessor, i.e., it also has runtime guards as predecessors.

Ah yes, restored the original code, always setting the branch weights for the middle block which contains the branch on the trip count

ayalz · 2024-12-10T18:18:16Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+  VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+  VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
+  VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
+  std::swap(NewMiddle->getSuccessors()[0], NewMiddle->getSuccessors()[1]);


Better via VPBlockUtils::swapSuccessors()

Done, thanks

ayalz · 2024-12-10T18:19:36Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+  // with one exiting if either the original condition of the vector latch is
+  // true or the early exit has been taken.
+  auto *LatchExitingBranch =
+      dyn_cast<VPInstruction>(LatchVPBB->getTerminator());


Suggested change

dyn_cast<VPInstruction>(LatchVPBB->getTerminator());

cast<VPInstruction>(LatchVPBB->getTerminator());

Also good to assert the opcode of LatchExitingBranch, before taking its operands.

Done, thanks

ayalz · 2024-12-10T18:20:04Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+  auto *IsLatchExitTaken =
+      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+                         LatchExitingBranch->getOperand(1));
+  auto *AnyExiting = Builder.createNaryOp(Instruction::Or,


Suggested change

auto *AnyExiting = Builder.createNaryOp(Instruction::Or,

auto *AnyExitTaken = Builder.createNaryOp(Instruction::Or,

Done, thanks

ayalz · 2024-12-10T18:22:42Z

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

+  /// UncountableExitingBlock by
+  ///  * updating the condition exiting the vector loop to include the early
+  ///    exit conditions
+  ///  * splitting the original middle block to branch to the early exit blocks


Suggested change

/// * splitting the original middle block to branch to the early exit blocks

/// * splitting the original middle block to branch to the early exit block

updated thanks

This splits off changes for more complex CFGs in VPlan from both llvm#114292 and llvm#112138. This simplifies their respective diffs.

This prepares for more complex CFGs in VPlan, as in llvm#114292 and llvm#112138.

Use getUniqueLatchExitBlock instead of getUniqueExitBlock in preparation for multi-exit vectorization *without* requiring a scalar epilogue. Split off as suggested from llvm#112138

Adjust the assertion in fixupIVUsers to only require a unique exit block if there are any values to fix up. This enables the bring up of multi-exit loop vectorization without requiring a scalar epilogue. Split off as suggested from llvm#112138.

…-cond-in-middle

Once PR llvm#112138 lands we are able to start vectorising more loops that have uncountable early exits. The typical loop structure looks like this: vector.body: ... %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat ... %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred) %iv.cmp = icmp eq i64 %index.next, 4 %exit.cond = or i1 %or.reduc, %iv.cmp br i1 %exit.cond, label %middle.split, label %vector.body middle.split: br i1 %or.reduc, label %found, label %notfound found: ret i64 1 notfound: ret i64 0 The problem with this is that %or.reduc is kept live after the loop, and since this is a boolean it typically requires making a copy of the condition code register. For AArch64 this requires an additional cset instruction, which is quite expensive for a typical find loop that only contains 6 or 7 instructions. This patch attempts to improve the codegen by sinking the reduction out of the loop to the location of its user. It's a lot cheaper to keep the predicate alive if the type is legal and has lots of registers for it. There is a potential downside in that a little more work is required after the loop, but I believe this is worth it since we are likely to spend most of our time in the loop.

david-arm · 2024-12-11T16:07:02Z

Updated, it required updating getUncountableEarlyExitingBlock to return nullptr if !UncountableEarlyExit and some slight adjustments to how it is set (and clearing the entries if it is false).

WDYT @david-arm ?

Sorry I only saw this now. It was hard to see amongst all the other comments. The original idea behind getUncountableEarlyExitingBlock was that you should only be calling this if you know you have an uncountable early exit. That's what the existing assert

    assert(getUncountableExitingBlocks().size() == 1 &&
           "Expected only a single uncountable exiting block");

was doing. It's pretty difficult to parse all of the comment history in github - is the reason for the change because you want to avoid calling hasUncountableEarlyExit and are doing this as a shortcut? That seems fine.

PR llvm#112138 introduced initial support for dispatching to multiple exit blocks via split middle blocks. This patch fixes a few issues so that we can enable more tests to use the new enable-early-exit-vectorization flag. Fixes are: 1. The code to bail out for any loop live-out values happens too late. This is because collectUsersInExitBlocks ignores induction variables, which get dealt with in fixupIVUsers. I've moved the check much earlier in processLoop by looking for outside users of loop-defined values. 2. We shouldn't yet be interleaving when vectorising loops with uncountable early exits, since we've not added support for this yet. 3. Similarly, we also shouldn't be creating vector epilogues. 4. Similarly, we shouldn't enable tail-folding. 5. The existing implementation doesn't yet support loops that require scalar epilogues, although I plan to add that as part of PR llvm#88385. 6. The new split middle blocks weren't being added to the parent loop.

fhahn · 2024-12-11T21:10:27Z

was doing. It's pretty difficult to parse all of the comment history in github - is the reason for the change because you want to avoid calling hasUncountableEarlyExit and are doing this as a shortcut? That seems fine.

Yep exactly, it is in some places more convenient to have a single call that returns nullptr if there's no vectorizable early exit block

david-arm · 2024-12-12T09:17:46Z

Many thanks for all your work on this patch @fhahn!

PR llvm#112138 introduced initial support for dispatching to multiple exit blocks via split middle blocks. This patch fixes a few issues so that we can enable more tests to use the new enable-early-exit-vectorization flag. Fixes are: 1. The code to bail out for any loop live-out values happens too late. This is because collectUsersInExitBlocks ignores induction variables, which get dealt with in fixupIVUsers. I've moved the check much earlier in processLoop by looking for outside users of loop-defined values. 2. We shouldn't yet be interleaving when vectorising loops with uncountable early exits, since we've not added support for this yet. 3. Similarly, we also shouldn't be creating vector epilogues. 4. Similarly, we shouldn't enable tail-folding. 5. The existing implementation doesn't yet support loops that require scalar epilogues, although I plan to add that as part of PR llvm#88385. 6. The new split middle blocks weren't being added to the parent loop.

PR #112138 introduced initial support for dispatching to multiple exit blocks via split middle blocks. This patch fixes a few issues so that we can enable more tests to use the new enable-early-exit-vectorization flag. Fixes are: 1. The code to bail out for any loop live-out values happens too late. This is because collectUsersInExitBlocks ignores induction variables, which get dealt with in fixupIVUsers. I've moved the check much earlier in processLoop by looking for outside users of loop-defined values. 2. We shouldn't yet be interleaving when vectorising loops with uncountable early exits, since we've not added support for this yet. 3. Similarly, we also shouldn't be creating vector epilogues. 4. Similarly, we shouldn't enable tail-folding. 5. The existing implementation doesn't yet support loops that require scalar epilogues, although I plan to add that as part of PR #88385. 6. The new split middle blocks weren't being added to the parent loop.

fhahn mentioned this pull request Oct 13, 2024

[VPlan] Introduce multi-branch recipe, use for multi-exit loops (WIP). #109193

Closed

[VPlan] Support VPIRBBs and VPIRInst phis with multiple predecessors.

245b56a

fhahn force-pushed the vplan-branch-on-multi-cond-in-middle branch from e4c27f0 to fd55cd8 Compare October 22, 2024 02:11

fhahn marked this pull request as ready for review October 22, 2024 02:15

fhahn requested a review from ayalz October 22, 2024 02:15

llvmbot added vectorizers llvm:transforms labels Oct 22, 2024

fhahn requested review from rengolin, aniragil and david-arm October 22, 2024 02:16

[VPlan] Dispatch to multiple exit blocks via middle blocks.

47258de

A more lightweight variant of llvm#109193, which dispatches to multiple exit blocks via the middle blocks.

fhahn force-pushed the vplan-branch-on-multi-cond-in-middle branch from fd55cd8 to 47258de Compare October 22, 2024 02:16

david-arm reviewed Oct 22, 2024

View reviewed changes

david-arm requested a review from huntergr-arm October 22, 2024 11:00

david-arm reviewed Oct 23, 2024

View reviewed changes

david-arm mentioned this pull request Oct 24, 2024

[VPlan][NFC] Add new getMiddleBlock interface to VPlan #113558

Merged

fhahn added 2 commits October 31, 2024 17:30

Merge remote-tracking branch 'origin/main' into vplan-branch-on-multi…

9265fb1

…-cond-in-middle

!fixup address first set of comments, thanks!

3831acb

fhahn commented Oct 31, 2024

View reviewed changes

fhahn commented Dec 10, 2024

View reviewed changes

ayalz reviewed Dec 10, 2024

View reviewed changes

broxigarchen pushed a commit to broxigarchen/llvm-project that referenced this pull request Dec 10, 2024

[VPlan] Use RPOT for VPlan codegen and printing.

92cd68c

This splits off changes for more complex CFGs in VPlan from both llvm#114292 and llvm#112138. This simplifies their respective diffs.

broxigarchen pushed a commit to broxigarchen/llvm-project that referenced this pull request Dec 10, 2024

[VPlan] Iterate over blocks in VPlan::execute in RPOT (NFC).

d38559b

This prepares for more complex CFGs in VPlan, as in llvm#114292 and llvm#112138.

Merge remote-tracking branch 'origin/main' into vplan-branch-on-multi…

65d0288

…-cond-in-middle

david-arm mentioned this pull request Dec 11, 2024

[AArch64] Improve codegen of vectorised early exit loops #119534

Open

!fixup address latest comments, thanks!

8d04383

david-arm approved these changes Dec 11, 2024

View reviewed changes

david-arm added a commit to david-arm/llvm-project that referenced this pull request Dec 11, 2024

[VPlan] Dispatch to multiple exit blocks via middle blocks. llvm#112138

0807837

fhahn merged commit 5fae408 into llvm:main Dec 11, 2024
9 checks passed

fhahn deleted the vplan-branch-on-multi-cond-in-middle branch December 11, 2024 21:11

fhahn restored the vplan-branch-on-multi-cond-in-middle branch December 11, 2024 21:11

fhahn deleted the vplan-branch-on-multi-cond-in-middle branch December 11, 2024 21:11

fhahn added a commit to fhahn/llvm-project that referenced this pull request Dec 12, 2024

!fixup update after merging llvm#112138

a0af583

fhahn mentioned this pull request Dec 12, 2024

[VPlan] Hook IR blocks into VPlan during skeleton creation (NFC) #114292

Merged

		assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
		"Expected a single exit block for escaping values");

	dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
	cast<VPInstruction>(LatchVPBB->getTerminator());

	auto *AnyExiting = Builder.createNaryOp(Instruction::Or,
	auto *AnyExitTaken = Builder.createNaryOp(Instruction::Or,

	/// * splitting the original middle block to branch to the early exit blocks
	/// * splitting the original middle block to branch to the early exit block

[VPlan] Dispatch to multiple exit blocks via middle blocks. #112138

[VPlan] Dispatch to multiple exit blocks via middle blocks. #112138

Conversation

fhahn commented Oct 13, 2024 • edited Loading

github-actions bot commented Oct 22, 2024 • edited Loading

llvmbot commented Oct 22, 2024

llvmbot commented Oct 22, 2024

david-arm left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

david-arm left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

fhahn commented Oct 28, 2024

fhahn left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

david-arm commented Nov 1, 2024

fhahn left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayalz Dec 10, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayalz Dec 10, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

fhahn commented Oct 13, 2024 •

edited

Loading

github-actions bot commented Oct 22, 2024 •

edited

Loading

ayalz Dec 10, 2024 •

edited

Loading

ayalz Dec 10, 2024 •

edited

Loading