From 47258deea863675e43fd7fd48376dce131441dc5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Sep 2024 21:35:57 +0100 Subject: [PATCH] [VPlan] Dispatch to multiple exit blocks via middle blocks. A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193, which dispatches to multiple exit blocks via the middle blocks. --- .../Vectorize/LoopVectorizationLegality.h | 3 + .../Vectorize/LoopVectorizationLegality.cpp | 29 +++ .../Transforms/Vectorize/LoopVectorize.cpp | 82 +++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 39 ++- llvm/lib/Transforms/Vectorize/VPlan.h | 1 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 82 ++++++ .../Transforms/Vectorize/VPlanTransforms.h | 4 + .../Transforms/Vectorize/VPlanVerifier.cpp | 8 - .../LoopVectorize/X86/multi-exit-codegen.ll | 240 ++++++++++++++++++ .../LoopVectorize/X86/multi-exit-cost.ll | 18 +- .../LoopVectorize/X86/multi-exit-vplan.ll | 148 +++++++++++ 12 files changed, 614 insertions(+), 56 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index dc7e484a40a452..af6fae44cf0f09 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -287,6 +287,9 @@ class LoopVectorizationLegality { /// we can use in-order reductions. bool canVectorizeFPMath(bool EnableStrictReductions); + /// Returns true if the loop has an early exit that we can vectorize. + bool canVectorizeEarlyExit() const; + /// Return true if we can vectorize this loop while folding its tail by /// masking. 
bool canFoldTailByMasking() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 43be72f0f34d45..ee53d28a4c8282 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables.")); +static cl::opt + EnableEarlyExitVectorization("enable-early-exit-vectorization", + cl::init(false), cl::Hidden, cl::desc("")); + namespace llvm { cl::opt HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, @@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence( } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { + // When vectorizing early exits, create predicates for all blocks, except the + // header. + if (canVectorizeEarlyExit() && BB != TheLoop->getHeader()) + return true; return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { return true; } +bool LoopVectorizationLegality::canVectorizeEarlyExit() const { + // Currently only allow vectorizing loops with early exits, if early-exit + // vectorization is explicitly enabled and the loop has metadata to force + // vectorization. + if (!EnableEarlyExitVectorization) + return false; + + SmallVector Exiting; + TheLoop->getExitingBlocks(Exiting); + if (Exiting.size() == 1) + return false; + + LoopVectorizeHints Hints(TheLoop, true, *ORE); + if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) + return false; + + Function *Fn = TheLoop->getHeader()->getParent(); + return Hints.allowVectorization(Fn, TheLoop, + true /*VectorizeOnlyWhenForced*/); +} + // Helper function to canVectorizeLoopNestCFG. 
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e8653498d32a12..befe8f7c0076a3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel { // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - LLVM_DEBUG( - dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); - return true; + if (!Legal->canVectorizeEarlyExit()) { + LLVM_DEBUG( + dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); + return true; + } } if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " @@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) || + Legal->canVectorizeEarlyExit()) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -2758,8 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. 
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); - DenseMap MissingVals; // An external user of the last iteration's value should see the value that @@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, if (PHI->getBasicBlockIndex(MiddleBlock) == -1) PHI->addIncoming(I.second, MiddleBlock); } + + assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) && + "Expected a single exit block"); } namespace { @@ -3599,7 +3603,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() && + (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit())) AddToWorklistIfAllowed(Cmp); } @@ -7692,12 +7697,15 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + VPBasicBlock *ExitVPBB = nullptr; + if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) { + ExitVPBB = cast( + BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction( + dyn_cast(&R), State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + } } // 2.6. 
Maintain Loop Hints @@ -7723,6 +7731,7 @@ DenseMap LoopVectorizationPlanner::executePlan( LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } + TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) @@ -7735,15 +7744,17 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (ExitVPBB) { + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; @@ -8128,7 +8139,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid // adding uses of an otherwise potentially dead instruction. 
- if (OrigLoop->isLoopExiting(Src)) + if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); @@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { + if (!Plan.getVectorLoopRegion()->getSingleSuccessor()) + return {}; auto *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); // No edge from the middle block to the unique exit block has been inserted @@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences( // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. + if (!VectorRegion->getSingleSuccessor()) + return; auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); VPBasicBlock *ScalarPHVPBB = nullptr; if (MiddleVPBB->getNumSuccessors() == 2) { @@ -9146,10 +9161,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - SetVector ExitUsersToFix = collectUsersInExitBlock( - OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); - addUsersInExitBlock(*Plan, ExitUsersToFix); + if (Legal->canVectorizeEarlyExit()) { + VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop, + RecipeBuilder); + } else { + SetVector ExitUsersToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); + addUsersInExitBlock(*Plan, ExitUsersToFix); + } // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -9277,8 +9297,6 @@ void 
LoopVectorizationPlanner::adjustRecipesForReductions( using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); - VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9297,8 +9315,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert(UserRecipe->getParent() == MiddleVPBB && - "U must be either in the loop region or the middle block."); continue; } Worklist.insert(UserRecipe); @@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*LatchVPBB->begin()); + if (!VectorLoopRegion->getSingleSuccessor()) + return; + VPBasicBlock *MiddleVPBB = + cast(VectorLoopRegion->getSingleSuccessor()); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8609514c39e7d0..eb7c808551340d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) { // backedges. A backward successor is set when the branch is created. const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + if (TermBr->getSuccessor(idx) && + PredVPBlock == getPlan()->getVectorLoopRegion() && + PredVPBlock->getNumSuccessors()) { + // Update PRedBB and TermBr for BranchOnMultiCond in predecessor. 
+ PredBB = TermBr->getSuccessor(1); + TermBr = cast(PredBB->getTerminator()); + idx = 0; + } assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, IRBB); @@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); if (!RequiresScalarEpilogueCheck) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); return Plan; } @@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); - // The connection order corresponds to the operands of the conditional branch. - VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + if (IRExitBlock) { + auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + // The connection order corresponds to the operands of the conditional + // branch. + VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator(); // Here we use the same DebugLoc as the scalar loop latch terminator instead @@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) { // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); + getVectorLoopRegion()->getNumSuccessors() == 1 + ? 
cast(getVectorLoopRegion()->getSuccessors()[0]) + : cast(getVectorLoopRegion()->getSuccessors()[1]); // Find the VPBB for the scalar preheader, relying on the current structure // when creating the middle block and its successrs: if there's a single // predecessor, it must be the scalar preheader. Otherwise, the second @@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) { MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]); assert(!isa(ScalarPhVPBB) && "scalar preheader cannot be wrapped already"); + if (ScalarPhVPBB->getNumSuccessors() != 0) { + ScalarPhVPBB = cast(ScalarPhVPBB->getSuccessors()[1]); + MiddleVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh); replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); @@ -1065,6 +1083,10 @@ void VPlan::execute(VPTransformState *State) { VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + if (!getVectorLoopRegion()->getSingleSuccessor()) + VectorLatchBB = + cast(VectorLatchBB->getTerminator())->getSuccessor(1); + // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); @@ -1091,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) { // Move the last step to the end of the latch block. This ensures // consistent placement of all induction updates. Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI()) + Inc->moveBefore(VectorLatchBB->getTerminator()); + else + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); // Use the steps for the last part as backedge value for the induction. 
if (auto *IV = dyn_cast(&R)) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 59a084401cc9bf..21f44eac188936 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + AnyOf, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index be3e958320e771..9d5c609ad26043 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const { default: return true; } + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPScalarCastSC: return false; + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInstructionSC: return mayWriteToMemory(); case VPWidenCallSC: { @@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: + case VPInstruction::AnyOf: return true; default: return false; @@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } + case VPInstruction::AnyOf: { + Value *A = State.get(getOperand(0)); + return Builder.CreateOrReduce(A); + } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == 
VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::AnyOf; } bool VPInstruction::isSingleScalar() const { @@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return false; case Instruction::ICmp: case Instruction::Select: + case Instruction::Or: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); @@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::AnyOf: + O << "any-of"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d50f3c0c3f3e04..a86498eb9aa30c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); + for (VPRecipeBase &R : make_early_inc_range( + reverse(*cast(Plan.getPreheader())))) { + if (isDeadRecipe(R)) + R.eraseFromParent(); + } + for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly(RPOT))) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. 
@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups( } } } + +void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder) { + auto *LatchVPBB = + cast(Plan.getVectorLoopRegion()->getExiting()); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + + const SCEV *BackedgeTakenCount = + SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *TripCount = SE.getTripCountFromExitCount( + BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop); + VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE); + Plan.getTripCount()->replaceAllUsesWith(NewTC); + Plan.resetTripCount(NewTC); + + VPValue *EarlyExitTaken = nullptr; + SmallVector ExitingBBs; + OrigLoop->getExitingBlocks(ExitingBBs); + for (BasicBlock *Exiting : ExitingBBs) { + auto *ExitingTerm = cast(Exiting->getTerminator()); + BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0); + BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1); + VPIRBasicBlock *VPExitBlock; + if (OrigLoop->getUniqueExitBlock()) + VPExitBlock = cast(MiddleVPBB->getSuccessors()[0]); + else + VPExitBlock = VPIRBasicBlock::fromBasicBlock( + !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); + + for (VPRecipeBase &R : *VPExitBlock) { + auto *ExitIRI = cast(&R); + auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); + if (!ExitPhi) + break; + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting); + VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue); + ExitIRI->addOperand(V); + } + + if (Exiting == OrigLoop->getLoopLatch()) { + if (MiddleVPBB->getNumSuccessors() == 0) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } + continue; + } + + VPValue *M = RecipeBuilder.getBlockInMask( + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *N = Builder.createNot(M); + EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N}); + + VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB); + VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion); + VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock); + VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB); + + VPBuilder MiddleBuilder(NewMiddle); + MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken}); + // MiddleVPBB = NewMiddle; + } + auto *Term = dyn_cast(LatchVPBB->getTerminator()); + auto *IsLatchExiting = Builder.createICmp( + CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1)); + auto *AnyExiting = + Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting); + Term->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 60a44bfb0dca6b..9745211db275f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -123,6 +123,10 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); + + static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7ea5ee341cc547..1ac79f8887ab46 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } - VPBlockBase *MiddleBB = - IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor(); - if (IRBB != IRBB->getPlan()->getPreheader() && - IRBB->getSinglePredecessor() != MiddleBB) { - errs() << "VPIRBasicBlock can only be used as pre-header or a successor of " - "middle-block at the moment!\n"; - return false; - } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll new file mode 100644 index 00000000000000..0c33715c6bd271 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s + +define i64 @multi_exit_with_store(ptr %p, i64 %N) { +; MULTI-LABEL: define i64 @multi_exit_with_store( +; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; MULTI-NEXT: [[ENTRY:.*]]: +; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MULTI: [[VECTOR_PH]]: +; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] 
= insertelement <4 x i64> poison, i64 [[N]], i64 0 +; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; MULTI-NEXT: br label %[[VECTOR_BODY:.*]] +; MULTI: [[VECTOR_BODY]]: +; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]] +; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]]) +; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], +; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; MULTI: [[MIDDLE_SPLIT]]: +; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]] +; MULTI: [[MIDDLE_BLOCK]]: +; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]] +; MULTI: [[SCALAR_PH]]: +; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MULTI-NEXT: br label %[[LOOP_HEADER:.*]] +; MULTI: [[LOOP_HEADER]]: +; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; MULTI-NEXT: br i1 [[CMP1]], 
label %[[E1]], label %[[LOOP_LATCH]] +; MULTI: [[LOOP_LATCH]]: +; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; MULTI: [[E1]]: +; MULTI-NEXT: ret i64 0 +; MULTI: [[E2]]: +; MULTI-NEXT: ret i64 1 +; +; DEFAULT-LABEL: define i64 @multi_exit_with_store( +; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127) +; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]] +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br 
label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] +; DEFAULT: [[LOOP_HEADER]]: +; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]] +; DEFAULT: [[LOOP_LATCH]]: +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: [[E1]]: +; DEFAULT-NEXT: ret i64 0 +; DEFAULT: [[E2]]: +; DEFAULT-NEXT: ret i64 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1 + +e1: + ret i64 0 + +e2: + ret i64 1 +} + +define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) { +; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store( +; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; MULTI-NEXT: [[ENTRY:.*]]: +; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MULTI: [[VECTOR_PH]]: +; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; MULTI-NEXT: br label 
%[[VECTOR_BODY:.*]] +; MULTI: [[VECTOR_BODY]]: +; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]] +; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]]) +; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], +; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; MULTI: [[MIDDLE_SPLIT]]: +; MULTI-NEXT: br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]] +; MULTI: [[MIDDLE_BLOCK]]: +; MULTI-NEXT: br i1 true, label %[[E]], label %[[SCALAR_PH]] +; MULTI: [[SCALAR_PH]]: +; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MULTI-NEXT: br label %[[LOOP_HEADER:.*]] +; MULTI: [[LOOP_HEADER]]: +; MULTI-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; MULTI-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]] +; MULTI-NEXT: br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]] +; MULTI: [[LOOP_LATCH]]: +; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]] +; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; MULTI-NEXT: 
[[INC]] = add nuw i64 [[IV]], 1 +; MULTI-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 +; MULTI-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; MULTI: [[E]]: +; MULTI-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ] +; MULTI-NEXT: ret i64 [[P1]] +; +; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store( +; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127) +; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], 
[ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] +; DEFAULT: [[LOOP_HEADER]]: +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; DEFAULT-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]] +; DEFAULT-NEXT: br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]] +; DEFAULT: [[LOOP_LATCH]]: +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]] +; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INC]] = add nuw i64 [[IV]], 1 +; DEFAULT-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 +; DEFAULT-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT: [[E]]: +; DEFAULT-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ] +; DEFAULT-NEXT: ret i64 [[P1]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e, label %loop.header, !llvm.loop !1 + +e: + %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p1 +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. 
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index cd128979fc1431..1c02f10753745c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] -; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]]) -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]]) -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; 
CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]] +; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]]) +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll new file mode 100644 index 00000000000000..5c5d532b93bc89 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s + +define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) { +; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N> +; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store 
vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; 
CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 
0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e, label %loop.header, !llvm.loop !1 + +e: + %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p1 +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true}