diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index d560ca648132c9..0fa13e2a3d0e15 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -294,6 +294,10 @@ template class LoopBase { /// Otherwise return null. BlockT *getUniqueExitBlock() const; + /// Return the unique exit block for the latch, or null if the latch does not + /// exit to exactly one distinct block. Asserts that a latch exists. + BlockT *getUniqueLatchExitBlock() const; + /// Return true if this loop does not have any exit blocks. bool hasNoExitBlocks() const; diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index d19022729ace32..4945ea30950d23 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -159,6 +159,16 @@ BlockT *LoopBase::getUniqueExitBlock() const { return getExitBlockHelper(this, true).first; } +template +BlockT *LoopBase::getUniqueLatchExitBlock() const { + const BlockT *Latch = getLoopLatch(); + assert(Latch && "Latch block must exists"); + SmallVector ExitBlocks; + getUniqueExitBlocksHelper(this, ExitBlocks, + [Latch](const BlockT *BB) { return BB == Latch; }); + return ExitBlocks.size() == 1 ? ExitBlocks[0] : nullptr; +} + /// getExitEdges - Return all pairs of (_inside_block_,_outside_block_). 
template void LoopBase::getExitEdges( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 43be72f0f34d45..f16e32b1f2d9cd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -83,6 +83,12 @@ static cl::opt EnableHistogramVectorization( "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden, cl::desc("Enables autovectorization of some loops containing histograms")); +static cl::opt AssumeNoMemFault( + "vectorizer-no-mem-fault", cl::init(false), cl::Hidden, + cl::desc("Assume vectorized loops will not have memory faults, which is " + "potentially unsafe but can be useful for testing vectorization " + "of early exit loops.")); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -1710,11 +1716,15 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { Predicates.clear(); if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates)) { - reportVectorizationFailure( - "Loop may fault", - "Cannot vectorize potentially faulting early exit loop", - "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); - return false; + if (!AssumeNoMemFault) { + reportVectorizationFailure( + "Loop may fault", + "Cannot vectorize potentially faulting early exit loop", + "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + return false; + } else + LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not " + << "fault\n"); } [[maybe_unused]] const SCEV *SymbolicMaxBTC = diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 001c8987667df8..1f8b074208bacd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -178,6 +178,10 @@ static cl::opt EnableEpilogueVectorization( "enable-epilogue-vectorization", 
cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops.")); +static cl::opt EnableEarlyExitVectorization( + "enable-early-exit-vectorization", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization of early exit loops.")); + static cl::opt EpilogueVectorizationForceVF( "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " @@ -547,6 +551,10 @@ class InnerLoopVectorizer { BasicBlock *MiddleBlock, VPlan &Plan, VPTransformState &State); + void fixupEarlyExitIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, + BasicBlock *VectorEarlyExitBB, VPlan &Plan, + VPTransformState &State); + /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -637,10 +645,6 @@ class InnerLoopVectorizer { /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; - /// The unique ExitBlock of the scalar loop if one exists. Note that - /// there can be multiple exiting edges reaching this block. - BasicBlock *LoopExitBlock; - /// The scalar loop body. BasicBlock *LoopScalarBody; @@ -1362,11 +1366,28 @@ class LoopVectorizationCostModel { } // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + if (!Legal->hasUncountableEarlyExit() && + TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { LLVM_DEBUG( dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); return true; } + // If this is a loop with an uncountable early exit, then we may validly + // exit from a non-latch block and not require a scalar epilogue for the + // last iteration, since these exits are handled specially. However, since + // we could have both countable and uncountable exits we must search all + // the exits. 
+ if (Legal->hasUncountableEarlyExit()) { + const SmallVector &CountableExitingBlocks = + Legal->getCountableExitingBlocks(); + unsigned NumBlocks = CountableExitingBlocks.size(); + if (NumBlocks > 1 || (NumBlocks == 1 && CountableExitingBlocks[0] != + TheLoop->getLoopLatch())) { + LLVM_DEBUG( + dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); + return true; + } + } if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " "interleaved group requires scalar epilogue\n"); @@ -2004,8 +2025,7 @@ class GeneratedRTChecks { /// adjusts the branches to branch to the vector preheader or \p Bypass, /// depending on the generated condition. BasicBlock *emitSCEVChecks(BasicBlock *Bypass, - BasicBlock *LoopVectorPreHeader, - BasicBlock *LoopExitBlock) { + BasicBlock *LoopVectorPreHeader) { if (!SCEVCheckCond) return nullptr; @@ -2478,7 +2498,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { BasicBlock *const SCEVCheckBlock = - RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); + RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); if (!SCEVCheckBlock) return nullptr; @@ -2533,8 +2553,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); - LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && + assert((OrigLoop->getUniqueLatchExitBlock() || + Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -2717,18 +2737,39 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). 
// We allow both, but they, obviously, have different values. - assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); + assert((OrigLoop->getUniqueExitBlock() || Legal->hasUncountableEarlyExit()) && + "Expected a single exit block"); DenseMap MissingVals; + BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch(); + auto IsUseFromUncountableExit = [&](Value *V, Instruction *UI) -> bool { + PHINode *PHI = dyn_cast(UI); + assert(PHI && "Expected LCSSA form"); + if (!Legal->hasUncountableEarlyExit()) + return false; + + // If this loop has an uncountable early exit then there could be a + // user of OrigPhi with either: + // 1. Multiple uses, because each exiting block (countable or + // uncountable) jumps to the same exit block, or .. + // 2. A single use with an incoming value from an uncountable exit + // block. + // In both cases there is no guarantee this came from a normal, countable + // exit. Currently if a loop has an uncountable early exit then it must + // have a latch with a countable exit. + int Index = PHI->getBasicBlockIndex(OrigLoopLatch); + return (Index == -1 || PHI->getIncomingValue(Index) != V); + }; + // An external user of the last iteration's value should see the value that // the remainder loop uses to initialize its own IV. 
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); + Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch); for (User *U : PostInc->users()) { Instruction *UI = cast(U); if (!OrigLoop->contains(UI)) { - assert(isa(UI) && "Expected LCSSA form"); - MissingVals[UI] = EndValue; + if (!IsUseFromUncountableExit(PostInc, UI)) + MissingVals[cast(UI)] = EndValue; } } @@ -2738,7 +2779,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, for (User *U : OrigPhi->users()) { auto *UI = cast(U); if (!OrigLoop->contains(UI)) { - assert(isa(UI) && "Expected LCSSA form"); + if (IsUseFromUncountableExit(OrigPhi, UI)) + continue; + IRBuilder<> B(MiddleBlock->getTerminator()); // Fast-math-flags propagate from the original induction instruction. @@ -2773,6 +2816,95 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, } } +void InnerLoopVectorizer::fixupEarlyExitIVUsers(PHINode *OrigPhi, + const InductionDescriptor &II, + BasicBlock *VectorEarlyExitBB, + VPlan &Plan, + VPTransformState &State) { + // There are two kinds of external IV usages - those that use the value + // computed in the last iteration (the PHI) and those that use the penultimate + // value (the value that feeds into the phi from the loop latch). + // We allow both, but they, obviously, have different values. + DenseMap MissingVals; + BasicBlock *OrigEarlyExitingBlock = Legal->getUncountableEarlyExitingBlock(); + BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch(); + Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch); + + auto FixUpPhi = [&](Instruction *UI, bool PostInc) -> Value * { + IRBuilder<> B(VectorEarlyExitBB->getTerminator()); + assert(isa(UI) && "Expected LCSSA form"); + + // Fast-math-flags propagate from the original induction instruction. + if (II.getInductionBinOp() && isa(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + + // We need the mask that led us into the early exit block. 
+ Value *EarlyExitMask = + State.get(Plan.getVectorLoopRegion()->getVectorEarlyExitCond()); + VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); + Type *CtzType = CanonicalIV->getStartValue()->getLiveInIRValue()->getType(); + Value *Ctz; + if (EarlyExitMask) + Ctz = B.CreateCountTrailingZeroElems(CtzType, EarlyExitMask); + else + Ctz = ConstantInt::get(CtzType, 0); + Ctz = B.CreateAdd(Ctz, cast(State.get( + CanonicalIV->getVPSingleValue(), VPLane(0)))); + if (PostInc) + Ctz = B.CreateAdd(Ctz, ConstantInt::get(CtzType, 1)); + + Value *Escape = nullptr; + VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); + assert(StepVPV && "step must have been expanded during VPlan execution"); + Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() + : State.get(StepVPV, VPLane(0)); + Escape = emitTransformedIndex(B, Ctz, II.getStartValue(), Step, + II.getKind(), II.getInductionBinOp()); + Escape->setName("ind.early.escape"); + + return Escape; + }; + + const Loop *L = this->OrigLoop; + auto isUsedInEarlyExitBlock = + [&L, &OrigEarlyExitingBlock](Value *V, Instruction *UI) -> bool { + if (!L->contains(UI)) { + PHINode *PHI = dyn_cast(UI); + assert(PHI && "Expected LCSSA form"); + int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock); + if (Index != -1 && PHI->getIncomingValue(Index) == V) + return true; + } + return false; + }; + + for (User *U : PostInc->users()) { + auto *UI = cast(U); + if (isUsedInEarlyExitBlock(PostInc, UI)) + MissingVals[UI] = FixUpPhi(UI, true); + } + + for (User *U : OrigPhi->users()) { + auto *UI = cast(U); + if (isUsedInEarlyExitBlock(OrigPhi, UI)) + MissingVals[UI] = FixUpPhi(UI, false); + } + + VPBasicBlock *EarlyExitVPBB = Plan.getVectorLoopRegion()->getEarlyExit(); + for (auto &I : MissingVals) { + PHINode *PHI = cast(I.first); + // One corner case we have to handle is two IVs "chasing" each-other, + // that is %IV2 = phi [...], [ %IV1, %latch ] + // In this case, if IV1 has an external use, we need to avoid adding both 
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we + // don't already have an incoming value for the vector early exit block. + if (PHI->getBasicBlockIndex(VectorEarlyExitBB) == -1) { + PHI->addIncoming(I.second, VectorEarlyExitBB); + Plan.removeLiveOut(PHI, EarlyExitVPBB); + } + } +} + namespace { struct CSEDenseMapInfo { @@ -2902,6 +3034,21 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); + if (VectorRegion->getEarlyExit()) { + // Fix-up external users of the induction variables. + VPBasicBlock *VectorEarlyExitVPBB = + cast(VectorRegion->getEarlyExit()); + BasicBlock *VectorEarlyExitBB = State.CFG.VPBB2IRBB[VectorEarlyExitVPBB]; + for (const auto &Entry : Legal->getInductionVars()) + fixupEarlyExitIVUsers(Entry.first, Entry.second, VectorEarlyExitBB, Plan, + State); + + BasicBlock *OrigEarlyExitBB = Legal->getUncountableEarlyExitBlock(); + if (Loop *EEL = LI->getLoopFor(OrigEarlyExitBB)) + EEL->addBasicBlockToLoop(VectorEarlyExitBB, *LI); + } + if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming @@ -2928,7 +3075,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; @@ -3546,9 +3692,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { SmallVector Exiting; TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { - auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && 
Cmp->hasOneUse()) - AddToWorklistIfAllowed(Cmp); + if (!Legal->hasUncountableEarlyExit() || + E != Legal->getUncountableEarlyExitingBlock()) { + auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + AddToWorklistIfAllowed(Cmp); + } } auto PrevVF = VF.divideCoefficientBy(2); @@ -3998,7 +4147,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + if (Legal->hasUncountableEarlyExit() || + TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { @@ -4603,7 +4753,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) + // TODO: Add support for loops with an early exit. + if (Legal->hasUncountableEarlyExit() || + OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -4839,6 +4991,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; + // We don't attempt to perform interleaving for early exit loops. 
+ if (Legal->hasUncountableEarlyExit()) + return 1; + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); @@ -6425,9 +6581,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); - - // This branch will be eliminated by if-conversion. - return 0; + else if (Legal->hasUncountableEarlyExit() && + I->getParent() == Legal->getUncountableEarlyExitingBlock()) { + // In order to determine whether we take an early exit or not we have to + // perform an or reduction of the vector predicate. + auto *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); + InstructionCost EECost = TTI.getArithmeticReductionCost( + Instruction::Or, Vec_i1Ty, std::nullopt, CostKind); + // Add on the cost of the conditional branch, which will remain. + EECost += TTI.getCFInstrCost(Instruction::Br, CostKind); + // TODO: The vector loop early exit block also needs to do work to + // determine the first lane that triggered the exit. We should probably + // add that somehow, but the cost will be negligible for long loops. + return EECost; + } else + // This branch will be eliminated by if-conversion. + return 0; // Note: We currently assume zero cost for an unconditional branch inside // a predicated block since it will become a fall-through, although we // may decide in the future to call TTI for all branches. 
@@ -7531,6 +7701,9 @@ DenseMap LoopVectorizationPlanner::executePlan( State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); BestVPlan.getPreheader()->execute(&State); } + if (Legal->hasUncountableEarlyExit()) + State.CFG.EarlyExitBB = Legal->getUncountableEarlyExitBlock(); + if (!ILV.getTripCount()) ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); else @@ -7584,7 +7757,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + cast(BestVPlan.getVectorLoopRegion()->getSuccessors()[0]); for (VPRecipeBase &R : *ExitVPBB) { createAndCollectMergePhiForReduction( dyn_cast(&R), State, OrigLoop, @@ -7807,7 +7980,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. - DT->changeImmediateDominator(LoopExitBlock, + DT->changeImmediateDominator(OrigLoop->getUniqueLatchExitBlock(), EPI.EpilogueIterationCountCheck); // Keep track of bypass blocks, as they feed start values to the induction and @@ -8672,7 +8845,7 @@ static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + cast(Plan.getVectorLoopRegion()->getSuccessors()[0]); // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
@@ -8680,7 +8853,12 @@ static SetVector collectUsersInExitBlock( return {}; SetVector ExitUsersToFix; VPBasicBlock *ExitVPBB = cast(MiddleVPBB->getSuccessors()[0]); - BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); + BasicBlock *ExitingBB; + if (Plan.getVectorLoopRegion()->getEarlyExit()) + ExitingBB = OrigLoop->getLoopLatch(); + else + ExitingBB = OrigLoop->getExitingBlock(); + for (VPRecipeBase &R : *ExitVPBB) { auto *ExitIRI = dyn_cast(&R); if (!ExitIRI) @@ -8719,7 +8897,7 @@ addUsersInExitBlock(VPlan &Plan, return; auto *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + cast(Plan.getVectorLoopRegion()->getSuccessors()[0]); BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); @@ -8758,7 +8936,7 @@ static void addLiveOutsForFirstOrderRecurrences( // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. - auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); + auto *MiddleVPBB = cast(VectorRegion->getSuccessors()[0]); VPBasicBlock *ScalarPHVPBB = nullptr; if (MiddleVPBB->getNumSuccessors() == 2) { // Order is strict: first is the exit block, second is the scalar preheader. @@ -8784,6 +8962,8 @@ static void addLiveOutsForFirstOrderRecurrences( if (!FOR) continue; + assert(VectorRegion->getNumSuccessors() == 1 && + "Cannot handle multiple successors"); // This is the second phase of vectorizing first-order recurrences, creating // extract for users outside the loop. An overview of the transformation is // described below. Suppose we have the following loop with some use after @@ -8876,6 +9056,59 @@ static void addLiveOutsForFirstOrderRecurrences( } } +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. 
+static void addUsersInEarlyExitBlock(BasicBlock *EarlyExitingBB, + BasicBlock *EarlyExitBB, + VPRecipeBuilder &RecipeBuilder, + VPlan &Plan) { + VPBasicBlock *EarlyExitVPBB = Plan.getVectorLoopRegion()->getEarlyExit(); + VPBuilder B(EarlyExitVPBB); + for (PHINode &ExitPhi : EarlyExitBB->phis()) { + Value *IncomingValue = ExitPhi.getIncomingValueForBlock(EarlyExitingBB); + VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue); + VPValue *EarlyExitMask = + Plan.getVectorLoopRegion()->getVectorEarlyExitCond(); + VPValue *Ext = + B.createNaryOp(VPInstruction::ExtractHighestActive, {V, EarlyExitMask}); + + Plan.addLiveOut(&ExitPhi, Ext, EarlyExitVPBB); + } +} + +static VPValue *getConditionForVectorEarlyExit(Loop *OrigLoop, + BasicBlock *ExitingBB, + VPlan &Plan, VPBuilder &Builder, + VPRecipeBuilder &RecipeBuilder, + VPRecipeBase *VPEarlyExitCond) { + // To make things easier we canonicalise the condition so that 'true' + // means take the early exit. + auto *BI = cast(ExitingBB->getTerminator()); + + // If the true destination is in the loop then we want to invert the + // condition so that true means early exit. + bool NeedsInvert = OrigLoop->contains(BI->getSuccessor(0)); + + VPValue *ScalarExitCond; + if (!VPEarlyExitCond) { + // If we didn't find the exit condition, then this must have been + // defined outside the loop and is loop invariant. + ScalarExitCond = RecipeBuilder.getVPValueOrAddLiveIn(BI->getCondition()); + if (NeedsInvert) + ScalarExitCond = Builder.createNot(ScalarExitCond); + Plan.getVectorLoopRegion()->setVectorEarlyExitCond(ScalarExitCond); + } else { + VPValue *EarlyExitMask = VPEarlyExitCond->getVPSingleValue(); + if (NeedsInvert) + EarlyExitMask = Builder.createNot(EarlyExitMask); + Plan.getVectorLoopRegion()->setVectorEarlyExitCond(EarlyExitMask); + // If any lane of EarlyExitMask would be true we should exit the loop. 
+ ScalarExitCond = + Builder.createNaryOp(VPInstruction::OrReduction, {EarlyExitMask}); + } + return ScalarExitCond; +} + VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { @@ -8898,9 +9131,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), - PSE, RequiresScalarEpilogueCheck, - CM.foldTailByMasking(), OrigLoop); + VPlanPtr Plan = VPlan::createInitialVPlan( + Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, + CM.foldTailByMasking(), OrigLoop, Legal->hasUncountableEarlyExit()); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split @@ -8964,8 +9197,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { return Legal->blockNeedsPredication(BB) || NeedsBlends; }); auto *MiddleVPBB = - cast(Plan->getVectorLoopRegion()->getSingleSuccessor()); + cast(Plan->getVectorLoopRegion()->getSuccessors()[0]); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); + + // If we find the recipe for the early exit condition we need to record it + // so that we can then generate the new vector exit condition. + VPRecipeBase *VPEarlyExitCond = nullptr; + Value *EarlyExitCond = nullptr; + BasicBlock *EarlyExitingBB = nullptr; + if (Legal->hasUncountableEarlyExit()) { + EarlyExitingBB = Legal->getUncountableEarlyExitingBlock(); + BranchInst *BI = cast(EarlyExitingBB->getTerminator()); + EarlyExitCond = BI->getCondition(); + } + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. 
@@ -9013,6 +9258,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { if (!Recipe) Recipe = RecipeBuilder.handleReplication(Instr, Range); + if (&I == EarlyExitCond) + VPEarlyExitCond = Recipe; + RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe)) { // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In @@ -9031,6 +9279,22 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPBB->appendRecipe(Recipe); } + // If this is an early exit block we need to do more work to generate the + // actual exit condition. We generate an or reduction of the vector + // condition so that we exit the loop if any lane of the vector would cause + // us to exit. + if (BB == EarlyExitingBB) { + VPValue *ScalarExitCond = getConditionForVectorEarlyExit( + OrigLoop, BB, *Plan, Builder, RecipeBuilder, VPEarlyExitCond); + + // Branch to early exit BB. + auto *NewBR = + new VPInstruction(VPInstruction::BranchOnCond, {ScalarExitCond}); + RecipeBuilder.setRecipe(cast(BB->getTerminator()), NewBR); + VPBB->appendRecipe(NewBR); + + Plan->getVectorLoopRegion()->setEarlyExiting(VPBB); + } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); VPBB = cast(VPBB->getSingleSuccessor()); } @@ -9049,6 +9313,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); addUsersInExitBlock(*Plan, ExitUsersToFix); + // First-order recurrences are not currently permitted with early exits, so + // there is currently no need to collect a list of early exit users in the + // same way as above. + if (EarlyExitingBB) + addUsersInEarlyExitBlock(EarlyExitingBB, + Legal->getUncountableEarlyExitBlock(), + RecipeBuilder, *Plan); + // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. 
@@ -9176,11 +9448,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); + cast(VectorLoopRegion->getSuccessors()[0]); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) continue; + assert(VectorLoopRegion->getNumSuccessors() == 1 && + "Cannot handle multiple succesors!"); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); RecurKind Kind = RdxDesc.getRecurrenceKind(); @@ -9645,15 +9919,71 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { } } -static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, - VectorizationFactor &VF, - std::optional VScale, Loop *L, - ScalarEvolution &SE, - ScalarEpilogueLowering SEL) { +static InstructionCost calculateEarlyExitCost(const TargetTransformInfo *TTI, + LoopVectorizationLegality *Legal, + Loop *L, ElementCount VF) { + unsigned NumCttzElemCalls = 0; + BasicBlock *OrigEarlyExitingBlock = Legal->getUncountableEarlyExitingBlock(); + BasicBlock *OrigLoopLatch = L->getLoopLatch(); + + auto isUsedInEarlyExitBlock = [&L, &OrigEarlyExitingBlock](Value *V, + User *U) -> bool { + auto *UI = cast(U); + if (!L->contains(UI)) { + PHINode *PHI = dyn_cast(UI); + assert(PHI && "Expected LCSSA form"); + int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock); + if (Index != -1 && PHI->getIncomingValue(Index) == V) + return true; + } + return false; + }; + + for (const auto &Entry : Legal->getInductionVars()) { + PHINode *OrigPhi = Entry.first; + Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch); + + for (User *U : PostInc->users()) + if (isUsedInEarlyExitBlock(PostInc, U)) + NumCttzElemCalls++; + + for (User *U : OrigPhi->users()) + if (isUsedInEarlyExitBlock(OrigPhi, U)) 
+ NumCttzElemCalls++; + } + + InstructionCost Cost = 0; + if (NumCttzElemCalls) { + LLVMContext &Context = L->getHeader()->getContext(); + // Ideally we'd query the vplan for the canonical IV type, but we don't + // have a vplan yet so let's assume it's 64-bit. + auto CtzType = IntegerType::getIntNTy(Context, 64); + auto VecI1Type = VectorType::get(IntegerType::getInt1Ty(Context), VF); + + IntrinsicCostAttributes Attrs( + Intrinsic::experimental_cttz_elts, CtzType, + {PoisonValue::get(VecI1Type), ConstantInt::getTrue(Context)}); + Cost = TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput); + Cost *= NumCttzElemCalls; + } + return Cost; +} + +static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, + VectorizationFactor &VF, + std::optional VScale, Loop *L, + ScalarEvolution &SE, + ScalarEpilogueLowering SEL, + InstructionCost EarlyExitCost) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) return false; + // Add on the cost of work required in the vector early exit block, if one + // exists. + if (EarlyExitCost.isValid()) + CheckCost += EarlyExitCost; + // When interleaving only scalar and vector cost will be equal, which in turn // would lead to a divide by 0. Fall back to hard threshold. 
if (VF.Width.isScalar()) { @@ -9802,7 +10132,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasUncountableEarlyExit()) { + if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { reportVectorizationFailure("Auto-vectorization of loops with uncountable " "early exit is not yet supported", "Auto-vectorization of loops with uncountable " @@ -9950,12 +10280,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (VF.Width.isVector() || SelectedIC > 1) Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + InstructionCost EarlyExitCost = InstructionCost::getInvalid(); + if (VF.Width.isVector() && LVL.hasUncountableEarlyExit()) + EarlyExitCost = calculateEarlyExitCost(TTI, &LVL, L, VF.Width); + // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE(), SEL)) { + !isOutsideLoopWorkProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, + *PSE.getSE(), SEL, EarlyExitCost)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e3a6388094940..98de6ff28637e3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -419,7 +419,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // Hook up the new basic block to its predecessors. for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); + auto *VPRB = dyn_cast(PredVPBlock); + + // The exiting block that leads to this block might be an early exit from + // a loop region. + VPBasicBlock *PredVPBB = VPRB && VPRB->getEarlyExit() == this + ? 
cast(VPRB->getEarlyExiting()) : PredVPBlock->getExitingBasicBlock(); + auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; @@ -441,6 +448,11 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // Set each forward successor here when it is created, excluding // backedges. A backward successor is set when the branch is created. unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + // Flip idx for an early exiting block; the parent region may be null. + VPRegionBlock *PredParentRegion = + dyn_cast_or_null(PredVPBB->getParent()); + if (PredParentRegion && PredParentRegion->getEarlyExiting() == PredVPBB) + idx = 1 - idx; assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, NewBB); @@ -497,6 +509,7 @@ void VPBasicBlock::execute(VPTransformState *State) { !((SingleHPred = getSingleHierarchicalPredecessor()) && SingleHPred->getExitingBasicBlock() == PrevVPBB && PrevVPBB->getSingleHierarchicalSuccessor() && + PrevVPBB != getEnclosingLoopRegion()->getEarlyExiting() && (SingleHPred->getParent() == getEnclosingLoopRegion() && !IsLoopRegion(SingleHPred))) && /* B */ !(Replica && getPredecessors().empty())) { /* C */ @@ -515,7 +528,8 @@ void VPBasicBlock::execute(VPTransformState *State) { UnreachableInst *Terminator = State->Builder.CreateUnreachable(); // Register NewBB in its loop. In innermost loops its the same for all // BB's. 
- if (State->CurrentVectorLoop) + if (State->CurrentVectorLoop && + this != getEnclosingLoopRegion()->getEarlyExit()) State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); State->CFG.PrevBB = NewBB; @@ -633,7 +647,11 @@ const VPRecipeBase *VPBasicBlock::getTerminator() const { } bool VPBasicBlock::isExiting() const { - return getParent() && getParent()->getExitingBasicBlock() == this; + const VPRegionBlock *VPRB = getParent(); + if (!VPRB) + return false; + return VPRB->getExitingBasicBlock() == this || + VPRB->getEarlyExiting() == this; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -874,7 +892,8 @@ VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) { VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, - bool TailFolded, Loop *TheLoop) { + bool TailFolded, Loop *TheLoop, + bool HasEarlyExit) { VPIRBasicBlock *Entry = VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); @@ -887,8 +906,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // uncountable exits whilst also ensuring the symbolic maximum and known // back-edge taken count remain identical for loops with countable exits. 
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert((!isa(BackedgeTakenCountSCEV) && - BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) && + assert(!isa(BackedgeTakenCountSCEV) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, @@ -908,6 +926,12 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + if (HasEarlyExit) { + VPBasicBlock *EarlyExitVPBB = new VPBasicBlock("vector.early.exit"); + TopRegion->setEarlyExit(EarlyExitVPBB); + VPBlockUtils::connectBlocks(TopRegion, EarlyExitVPBB); + } + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); if (!RequiresScalarEpilogueCheck) { VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -922,7 +946,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // 2) If we require a scalar epilogue, there is no conditional branch as // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. - BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); + BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); @@ -998,7 +1022,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 
-static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { +static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, + BasicBlock *IRBB) { VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB); for (auto &R : make_early_inc_range(*VPBB)) { assert(!R.isPhi() && "Tried to move phi recipe to end of block"); @@ -1012,6 +1037,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { VPBlockUtils::disconnectBlocks(VPBB, Succ); } delete VPBB; + return IRVPBB; } /// Generate the code inside the preheader and body of the vectorized loop. @@ -1035,7 +1061,7 @@ void VPlan::execute(VPTransformState *State) { // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); + cast(getVectorLoopRegion()->getSuccessors()[0]); // Find the VPBB for the scalar preheader, relying on the current structure // when creating the middle block and its successrs: if there's a single // predecessor, it must be the scalar preheader. Otherwise, the second @@ -1049,7 +1075,14 @@ void VPlan::execute(VPTransformState *State) { assert(!isa(ScalarPhVPBB) && "scalar preheader cannot be wrapped already"); replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh); - replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); + MiddleVPBB = replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); + + // Ensure the middle block is still the first successor. + for (auto *Succ : getVectorLoopRegion()->getSuccessors()) + if (Succ == MiddleVPBB) { + getVectorLoopRegion()->moveSuccessorToFront(MiddleVPBB); + break; + } // Disconnect the middle block from its single successor (the scalar loop // header) in both the CFG and DT. 
The branch will be recreated during VPlan @@ -1110,6 +1143,20 @@ void VPlan::execute(VPTransformState *State) { cast(Phi)->addIncoming(Val, VectorLatchBB); } + // Patch up early exiting vector block to jump to the original scalar loop's + // early exit block. + if (getVectorLoopRegion()->getEarlyExit()) { + VPBasicBlock *EarlyExitVPBB = + cast(getVectorLoopRegion()->getEarlyExit()); + BasicBlock *VectorEarlyExitBB = State->CFG.VPBB2IRBB[EarlyExitVPBB]; + BasicBlock *OrigEarlyExitBB = State->CFG.EarlyExitBB; + BranchInst *BI = BranchInst::Create(OrigEarlyExitBB); + BI->insertBefore(VectorEarlyExitBB->getTerminator()); + VectorEarlyExitBB->getTerminator()->eraseFromParent(); + State->CFG.DTU.applyUpdates( + {{DominatorTree::Insert, VectorEarlyExitBB, OrigEarlyExitBB}}); + } + State->CFG.DTU.flush(); assert(State->CFG.DTU.getDomTree().verify( DominatorTree::VerificationLevel::Fast) && @@ -1218,9 +1265,10 @@ LLVM_DUMP_METHOD void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::addLiveOut(PHINode *PN, VPValue *V) { - assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); - LiveOuts.insert({PN, new VPLiveOut(PN, V)}); +void VPlan::addLiveOut(PHINode *PN, VPValue *V, VPBasicBlock *IncomingBlock) { + auto Key = std::pair(PN, IncomingBlock); + assert(LiveOuts.count(Key) == 0 && "an exit value for PN already exists"); + LiveOuts.insert({Key, new VPLiveOut(PN, V, IncomingBlock)}); } static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, @@ -1291,8 +1339,9 @@ VPlan *VPlan::duplicate() { remapOperands(Entry, NewEntry, Old2NewVPValues); // Clone live-outs. - for (const auto &[_, LO] : LiveOuts) - NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]); + for (const auto &[Key, LO] : LiveOuts) + NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)], + Key.second); // Initialize remaining fields of cloned VPlan. 
NewPlan->VFs = VFs; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8c5246d613c13d..7bf1dac06a8911 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -345,6 +345,11 @@ struct VPTransformState { /// vector loop. BasicBlock *ExitBB = nullptr; + /// We need to keep track of the early exit block from the original scalar + /// loop in order to update the dominator tree correctly, since the vector + /// early exit will also jump to the original. + BasicBlock *EarlyExitBB = nullptr; + /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case /// of replication, maps the BasicBlock of the last replica created. SmallDenseMap VPBB2IRBB; @@ -617,6 +622,17 @@ class VPBlockBase { return true; } + void moveSuccessorToFront(VPBlockBase *Succ) { + if (Successors[0] == Succ) + return; + + removeSuccessor(Succ); + + VPBlockBase *Old = Successors[0]; + Successors[0] = Succ; + appendSuccessor(Old); + } + /// Replace all operands of VPUsers in the block with \p NewValue and also /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; @@ -662,10 +678,12 @@ class VPBlockBase { /// used. class VPLiveOut : public VPUser { PHINode *Phi; + VPBasicBlock *IncomingVPBB; public: - VPLiveOut(PHINode *Phi, VPValue *Op) - : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} + VPLiveOut(PHINode *Phi, VPValue *Op, VPBasicBlock *IncomingVPBB = nullptr) + : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi), + IncomingVPBB(IncomingVPBB) {} static inline bool classof(const VPUser *U) { return U->getVPUserID() == VPUser::VPUserID::LiveOut; @@ -1244,6 +1262,8 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). 
PtrAdd, + ExtractHighestActive, + OrReduction, }; private: @@ -3343,7 +3363,12 @@ class VPIRBasicBlock : public VPBasicBlock { }; /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks -/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG. +/// which form a Single-Entry-Single-Exiting or Single-Entry-Multiple-Exiting +/// subgraph of the output IR CFG. For the multiple-exiting case only a total +/// of two exits are currently supported and the early exit is tracked +/// separately. The first successor should always correspond to the normal +/// exiting block, i.e. vector latch -> middle.block. An optional second +/// successor corresponds to the early exit. /// A VPRegionBlock may indicate that its contents are to be replicated several /// times. This is designed to support predicated scalarization, in which a /// scalar if-then code structure needs to be generated VF * UF times. Having @@ -3351,13 +3376,25 @@ class VPIRBasicBlock : public VPBasicBlock { /// candidate VF's. The actual replication takes place only once the desired VF /// and UF have been determined. class VPRegionBlock : public VPBlockBase { - /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock. + /// Hold the Single Entry of the SESE/SEME region modelled by the + /// VPRegionBlock. VPBlockBase *Entry; - /// Hold the Single Exiting block of the SESE region modelled by the + /// Hold the normal Exiting block of the SESE/SEME region modelled by the /// VPRegionBlock. VPBlockBase *Exiting; + /// Hold the Early Exiting block of the SEME region. If this is a SESE region + /// this value should be nullptr. + VPBlockBase *EarlyExiting; + + /// Hold the Early Exit block of the SEME region, if one exists. + VPBasicBlock *EarlyExit; + + /// If one exists, this keeps track of the vector early mask that triggered + /// the early exit. 
+ VPValue *VectorEarlyExitCond; + /// An indicator whether this region is to generate multiple replicated /// instances of output IR corresponding to its VPBlockBases. bool IsReplicator; @@ -3366,6 +3403,7 @@ class VPRegionBlock : public VPBlockBase { VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting), + EarlyExiting(nullptr), EarlyExit(nullptr), VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) { assert(Entry->getPredecessors().empty() && "Entry block has predecessors."); assert(Exiting->getSuccessors().empty() && "Exit block has successors."); @@ -3374,6 +3412,7 @@ class VPRegionBlock : public VPBlockBase { } VPRegionBlock(const std::string &Name = "", bool IsReplicator = false) : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), + EarlyExiting(nullptr), EarlyExit(nullptr), VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) {} ~VPRegionBlock() override { @@ -3392,6 +3431,12 @@ class VPRegionBlock : public VPBlockBase { const VPBlockBase *getEntry() const { return Entry; } VPBlockBase *getEntry() { return Entry; } + /// Sets the early exit vector mask. + void setVectorEarlyExitCond(VPValue *V) { VectorEarlyExitCond = V; } + + /// Gets the early exit vector mask + VPValue *getVectorEarlyExitCond() const { return VectorEarlyExitCond; } + /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p /// EntryBlock must have no predecessors. void setEntry(VPBlockBase *EntryBlock) { @@ -3404,8 +3449,8 @@ class VPRegionBlock : public VPBlockBase { const VPBlockBase *getExiting() const { return Exiting; } VPBlockBase *getExiting() { return Exiting; } - /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p - /// ExitingBlock must have no successors. + /// Set \p ExitingBlock as the normal exiting VPBlockBase of this + /// VPRegionBlock. \p ExitingBlock must have no successors. 
void setExiting(VPBlockBase *ExitingBlock) { assert(ExitingBlock->getSuccessors().empty() && "Exit block cannot have successors."); @@ -3413,6 +3458,29 @@ class VPRegionBlock : public VPBlockBase { ExitingBlock->setParent(this); } + /// Set \p EarlyExitingBlock as the early exiting VPBlockBase of this + /// VPRegionBlock. \p EarlyExitingBlock must have a successor, since + /// it cannot be the latch. + void setEarlyExiting(VPBlockBase *EarlyExitingBlock) { + assert(EarlyExitingBlock->getNumSuccessors() == 1 && + "Early exit block must have a successor."); + assert(EarlyExitingBlock->getParent() == this && + "Early exit block should already be in loop region"); + EarlyExiting = EarlyExitingBlock; + } + + const VPBlockBase *getEarlyExiting() const { return EarlyExiting; } + VPBlockBase *getEarlyExiting() { return EarlyExiting; } + + void setEarlyExit(VPBasicBlock *ExitBlock) { EarlyExit = ExitBlock; } + + const VPBasicBlock *getEarlyExit() const { return EarlyExit; } + VPBasicBlock *getEarlyExit() { return EarlyExit; } + + /// Return the number of exiting blocks from this region. It should match + /// the number of successors. + unsigned getNumExitingBlocks() const { return EarlyExiting ? 2 : 1; } + /// Returns the pre-header VPBasicBlock of the loop region. VPBasicBlock *getPreheaderVPBB() { assert(!isReplicator() && "should only get pre-header of loop regions"); @@ -3505,7 +3573,7 @@ class VPlan { /// Values used outside the plan. It contains live-outs that need fixing. Any /// live-out that is fixed outside VPlan needs to be removed. The remaining /// live-outs are fixed via VPLiveOut::fixPhi. - MapVector LiveOuts; + MapVector, VPLiveOut *> LiveOuts; /// Mapping from SCEVs to the VPValues representing their expansions. 
/// NOTE: This mapping is temporary and will be removed once all users have @@ -3549,7 +3617,8 @@ class VPlan { static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, - bool TailFolded, Loop *TheLoop); + bool TailFolded, Loop *TheLoop, + bool HasEarlyExit = false); /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, @@ -3686,12 +3755,20 @@ class VPlan { return cast(&*EntryVPBB->begin()); } - void addLiveOut(PHINode *PN, VPValue *V); + void addLiveOut(PHINode *PN, VPValue *V, + VPBasicBlock *IncomingBlock = nullptr); - const MapVector &getLiveOuts() const { + const MapVector, VPLiveOut *> & + getLiveOuts() const { return LiveOuts; } + void removeLiveOut(PHINode *PN, VPBasicBlock *IncomingBlock = nullptr) { + auto Key = std::pair(PN, IncomingBlock); + delete LiveOuts[Key]; + LiveOuts.erase(Key); + } + VPValue *getSCEVExpansion(const SCEV *S) const { return SCEVToExpansion.lookup(S); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9b1294f2c42822..1a9e8f41f79913 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -60,8 +60,10 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractFromEnd: + case VPInstruction::ExtractHighestActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: + case VPInstruction::OrReduction: case VPInstruction::PtrAdd: return false; default: @@ -204,15 +206,20 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPRecipeBase 
*ExitingRecipe = ExitValue->getDefiningRecipe(); - auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr; - // Values leaving the vector loop reach live out phi's in the exiting block - // via middle block. - auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion() - ? MiddleVPBB - : ExitingVPBB; + VPBasicBlock *PredVPBB = nullptr; + if (IncomingVPBB) + PredVPBB = IncomingVPBB; + else { + VPRecipeBase *ExitRecipe = ExitValue->getDefiningRecipe(); + auto *ExitVPBB = ExitRecipe ? ExitRecipe->getParent() : nullptr; + VPBasicBlock *MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSuccessors()[0]); + // Values leaving the vector loop reach live out phi's in the exiting block + // via middle block. + PredVPBB = + !ExitVPBB || ExitVPBB->getEnclosingLoopRegion() ? MiddleVPBB : ExitVPBB; + } + BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; Value *V = State.get(ExitValue, VPLane(0)); if (Phi->getBasicBlockIndex(PredBB) != -1) @@ -386,6 +393,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::BranchOnCount: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::OrReduction: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: return true; @@ -521,6 +529,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return CondBr; VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion->getEarlyExiting() == getParent()) + return CondBr; + VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); return CondBr; @@ -613,6 +624,13 @@ Value *VPInstruction::generate(VPTransformState &State) { return ReducedPartRdx; } + case VPInstruction::ExtractHighestActive: { + Value *Vec = State.get(getOperand(0)); + Value *Mask = State.get(getOperand(1)); + Value *Ctz = + Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask); + return 
Builder.CreateExtractElement(Vec, Ctz); + } case VPInstruction::ExtractFromEnd: { auto *CI = cast(getOperand(1)->getLiveInIRValue()); unsigned Offset = CI->getZExtValue(); @@ -662,7 +680,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } - + case VPInstruction::OrReduction: { + Value *Val = State.get(getOperand(0)); + return Builder.CreateOrReduce(Val); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -670,7 +691,9 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ExtractHighestActive || + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::OrReduction; } bool VPInstruction::isSingleScalar() const { @@ -818,6 +841,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ExtractFromEnd: O << "extract-from-end"; break; + case VPInstruction::ExtractHighestActive: + O << "extract-highest-active"; + break; case VPInstruction::ComputeReductionResult: O << "compute-reduction-result"; break; @@ -827,6 +853,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::OrReduction: + O << "or reduction"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9ca14fc7812138..aadb9942bab4c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -382,7 +382,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { continue; auto *PredVPBB = dyn_cast_or_null(VPBB->getSinglePredecessor()); - if (!PredVPBB || PredVPBB->getNumSuccessors() != 1) + if (!PredVPBB || 
PredVPBB->getNumSuccessors() != 1 || + PredVPBB == Plan.getVectorLoopRegion()->getEarlyExiting()) continue; WorkList.push_back(VPBB); } @@ -399,6 +400,8 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { VPBlockUtils::disconnectBlocks(VPBB, Succ); VPBlockUtils::connectBlocks(PredVPBB, Succ); } + if (ParentRegion && ParentRegion->getEarlyExiting() == VPBB) + ParentRegion->setEarlyExiting(PredVPBB); delete VPBB; } return !WorkList.empty(); @@ -851,7 +854,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { /// Try to simplify recipe \p R. static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { using namespace llvm::VPlanPatternMatch; - if (auto *Blend = dyn_cast(&R)) { // Try to remove redundant blend recipes. SmallPtrSet UniqueValues; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 99bc4c38a3c3cd..5d5e2828da53d4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -244,13 +244,28 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } - VPBlockBase *MiddleBB = - IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor(); - if (IRBB != IRBB->getPlan()->getPreheader() && - IRBB->getSinglePredecessor() != MiddleBB) { - errs() << "VPIRBasicBlock can only be used as pre-header or a successor of " - "middle-block at the moment!\n"; - return false; + if (IRBB != IRBB->getPlan()->getPreheader()) { + const SmallVectorImpl &Succs = + IRBB->getPlan()->getVectorLoopRegion()->getSuccessors(); + + // First successor is always the middle block, and the middle block's first + // successor is always the exit block. + unsigned NumMatches = 0; + if (Succs[0]->getSuccessors()[0] == IRBB) { + NumMatches++; + } + + // The remaining successors should be vector early exits. 
+ for (unsigned I = 1; I < Succs.size(); I++) { + if (Succs[I]->getSingleSuccessor() == IRBB) + NumMatches++; + } + if (!NumMatches) { + errs() << "VPIRBasicBlock can only be used as pre-header or an indirect " + "successor of " + "VPRegionBlock at the moment!\n"; + return false; + } } return true; } @@ -269,7 +284,9 @@ static bool hasDuplicates(const SmallVectorImpl &VPBlockVec) { bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { auto *VPBB = dyn_cast(VPB); // Check block's condition bit. - if (VPB->getNumSuccessors() > 1 || + // NOTE: A VPRegionBlock can legally have multiple successors due to + // early exits from the loop. + if ((VPB->getNumSuccessors() > 1 && !isa(VPB)) || (VPBB && VPBB->getParent() && VPBB->isExiting() && !VPBB->getParent()->isReplicator())) { if (!VPBB || !VPBB->getTerminator()) { @@ -277,7 +294,7 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { "have a proper branch recipe!\n"; return false; } - } else { + } else if (!isa(VPB)) { if (VPBB && VPBB->getTerminator()) { errs() << "Unexpected branch recipe!\n"; return false; @@ -293,6 +310,26 @@ bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { return false; } + // If this is a loop region with multiple successors it must have as many + // exiting blocks as successors, even if the original scalar loop only had a + // single exit block. That's because in the vector loop we always create a + // middle block for the vector latch exit, which is distinct from the early + // exit. 
+ auto *VPRB = dyn_cast(VPB); + if (VPRB && VPRB->getNumExitingBlocks() != VPB->getNumSuccessors()) { + errs() << "Number of exiting blocks (" << VPRB->getNumExitingBlocks() + << ") does not match number of successors (" + << VPB->getNumSuccessors() << ")!\n"; + return false; + } + + if (auto *VPRB = dyn_cast(VPB)) { + if (VPRB->getNumExitingBlocks() != VPB->getNumSuccessors()) { + errs() << "Not enough exiting blocks for successors!\n"; + return false; + } + } + for (const VPBlockBase *Succ : Successors) { // There must be a bi-directional link between block and successor. const auto &SuccPreds = Succ->getPredecessors(); diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll index 49454ae18db79d..4657fa32415752 100644 --- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll @@ -1,7 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; REQUIRES: asserts -; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -debug-only=loop-vectorize \ +; RUN: 2>%t | FileCheck %s --check-prefixes=CHECK,MAY_FAULT ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -vectorizer-no-mem-fault \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NO_FAULT declare void @init_mem(ptr, i64); @@ -9,29 +12,62 @@ define i64 @same_exit_block_pre_inc_use1() { ; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' ; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], 
[[LAND_RHS]] ], [ 67, [[FOR_INC]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [1024 x i8] @@ -67,21 +103,55 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; 
CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; 
CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -118,21 +188,55 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_INC:%.*]], label [[LAND_RHS]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[FOR_INC]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label 
[[FOR_END_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[START_0_LCSSA]] ; entry: @@ -169,21 +273,50 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], 
[[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> , i64 [[TMP14]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne 
i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -219,21 +352,58 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE5:%.*]] = add i64 3, [[TMP27]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr 
inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE5]], [[VECTOR_EARLY_EXIT]] ], [ 66, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] ; entry: @@ -271,19 +441,19 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; 
CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX2]], [[LD1]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -319,21 +489,55 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq 
<4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], 
align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[INDEX_NEXT1]], [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 67, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -369,21 +573,57 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-NEXT: 
[[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], [[INDEX1]] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1 +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP21]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] 
] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP1]] ], [ [[INDEX2]], [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 66, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -419,21 +659,50 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP12]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> zeroinitializer, i64 [[TMP14]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = 
getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP1]] ], [ 1, [[LOOP_INC1]] ], [ 1, [[LOOP_INC]] ], [ [[TMP16]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -470,24 +739,58 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; 
CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -527,24 +830,53 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 
3, [[INDEX1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> , i64 [[TMP14]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; 
CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ [[TMP16]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -584,24 +916,58 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC4]] ] 
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]] +; CHECK: loop.inc4: +; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX2]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 
[[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX1]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX1]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] ; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX1]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]] ; entry: @@ -640,19 +1006,46 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call 
void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 
[[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop.early.exit: ; CHECK-NEXT: ret i64 0 ; CHECK: loop.end: @@ -694,24 +1087,58 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; 
CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP25]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], 
[[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP_INC1]] ], [ 67, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -752,24 +1179,60 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], [[INDEX1]] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1 +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP21]] 
+; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT1]], [[LOOP1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; 
CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC1]] ], [ 66, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -867,24 +1330,57 @@ define i64 @multiple_exits_one_early() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 60 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> 
[[TMP5]], i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 63, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX2]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END]] ; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC1]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: 
[[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP1]] ], [ [[INDEX2]], [[SEARCH]] ], [ 128, [[LOOP_INC1]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1040,22 +1536,57 @@ define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP21:%.*]] 
= call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; CHECK: loop.inc3: +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_INC:%.*]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true) +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], [[INDEX1]] +; CHECK-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] ; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 
false -; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC1]], label [[LOOP_END]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1096,20 +1627,20 @@ define i64 @loop_contains_safe_call() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX2]] ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) ; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: -; CHECK-NEXT: 
[[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1152,7 +1683,7 @@ define i64 @loop_contains_unsafe_call() { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: @@ -1199,20 +1730,20 @@ define i64 @loop_contains_safe_div() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; 
CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] ; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1346,7 +1877,6 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' ; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( ; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { ; CHECK-NEXT: entry: @@ -1460,28 +1990,89 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; MAY_FAULT-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, 
[[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] +; +; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { +; NO_FAULT-NEXT: entry: +; NO_FAULT-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; NO_FAULT-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; NO_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; NO_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO_FAULT: vector.ph: +; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; NO_FAULT: vector.body: +; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[LOOP_INC4:%.*]] ] +; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC4]] ] +; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX1]] +; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 -3 +; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load 
<4 x i8>, ptr [[TMP3]], align 1 +; NO_FAULT-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; NO_FAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3 +; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1 +; NO_FAULT-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> +; NO_FAULT-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[REVERSE]], [[REVERSE3]] +; NO_FAULT-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], +; NO_FAULT-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; NO_FAULT-NEXT: br i1 [[TMP9]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC4]] +; NO_FAULT: loop.inc4: +; NO_FAULT-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX1]], 4 +; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; NO_FAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 1020 +; NO_FAULT-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; NO_FAULT: vector.early.exit: +; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true) +; NO_FAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP11]] +; NO_FAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true) +; NO_FAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[INDEX1]] +; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = sub i64 1023, [[TMP14]] +; NO_FAULT-NEXT: br label [[LOOP_END:%.*]] +; NO_FAULT: middle.block: +; NO_FAULT-NEXT: br i1 false, label [[LOOP_END]], label [[SCALAR_PH]] +; NO_FAULT: scalar.ph: +; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; NO_FAULT-NEXT: br label [[LOOP:%.*]] +; NO_FAULT: loop: +; 
NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; NO_FAULT: loop.inc: +; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 +; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] +; NO_FAULT: loop.end: +; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; NO_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [1024 x i8] @@ -1573,25 +2164,113 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label 
[[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( +; MAY_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; MAY_FAULT: vector.ph: +; MAY_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; MAY_FAULT: vector.body: +; MAY_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; MAY_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; MAY_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; MAY_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; MAY_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; MAY_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; MAY_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; MAY_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; MAY_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; MAY_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; MAY_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; MAY_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; MAY_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; MAY_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; MAY_FAULT: loop.inc3: +; MAY_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; MAY_FAULT-NEXT: 
[[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; MAY_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; MAY_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; MAY_FAULT: vector.early.exit: +; MAY_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; MAY_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; MAY_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; MAY_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; MAY_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; MAY_FAULT-NEXT: br label [[LOOP_END:%.*]] +; MAY_FAULT: middle.block: +; MAY_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; MAY_FAULT: scalar.ph: +; MAY_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP35:![0-9]+]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], 
[ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] +; +; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( +; NO_FAULT-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { +; NO_FAULT-NEXT: entry: +; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO_FAULT: vector.ph: +; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; NO_FAULT: vector.body: +; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; NO_FAULT: loop.inc3: +; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; NO_FAULT: vector.early.exit: +; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; NO_FAULT-NEXT: br label [[LOOP_END:%.*]] +; NO_FAULT: middle.block: +; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; NO_FAULT: scalar.ph: +; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; NO_FAULT-NEXT: br label [[LOOP:%.*]] +; NO_FAULT: loop: +; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; NO_FAULT: loop.inc: +; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP37:![0-9]+]] +; NO_FAULT: loop.end: +; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; NO_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: br label %loop @@ -1621,7 +2300,6 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' ; DEBUG: Found an early exit loop with symbolic max backedge taken 
count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) ; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. ; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( ; CHECK-SAME: i32 [[END:%.*]]) { ; CHECK-NEXT: entry: @@ -1691,7 +2369,7 @@ declare void @abort() define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond' ; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 275 -; DEBUG: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +; DEBUG-NEXT: LV: We can vectorize this loop! ; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( ; CHECK-SAME: ptr [[S:%.*]]) { ; CHECK-NEXT: entry: @@ -1791,28 +2469,85 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { ; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' ; DEBUG: LV: Not vectorizing: Loop may fault. 
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 +; MAY_FAULT-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds 
i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] +; +; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; NO_FAULT-NEXT: entry: +; NO_FAULT-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 +; NO_FAULT-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 +; NO_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; NO_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO_FAULT: vector.ph: +; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; NO_FAULT: vector.body: +; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 
x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; NO_FAULT: loop.inc3: +; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; NO_FAULT: vector.early.exit: +; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; NO_FAULT-NEXT: br label [[LOOP_END:%.*]] +; NO_FAULT: middle.block: +; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; NO_FAULT: scalar.ph: +; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; NO_FAULT-NEXT: br label [[LOOP:%.*]] +; NO_FAULT: loop: +; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; NO_FAULT: loop.inc: +; NO_FAULT-NEXT: 
[[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP39:![0-9]+]] +; NO_FAULT: loop.end: +; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; NO_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = alloca [42 x i8] @@ -1842,25 +2577,79 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; MAY_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; 
MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] +; +; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; NO_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; NO_FAULT-NEXT: entry: +; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO_FAULT: vector.ph: +; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; NO_FAULT: vector.body: +; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 
[[TMP0]] +; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; NO_FAULT: loop.inc3: +; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; NO_FAULT: vector.early.exit: +; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; NO_FAULT-NEXT: br label [[LOOP_END:%.*]] +; NO_FAULT: middle.block: +; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; NO_FAULT: scalar.ph: +; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; NO_FAULT-NEXT: br label [[LOOP:%.*]] +; NO_FAULT: loop: +; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD2:%.*]] = load 
i8, ptr [[ARRAYIDX1]], align 1 +; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; NO_FAULT: loop.inc: +; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP41:![0-9]+]] +; NO_FAULT: loop.end: +; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; NO_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: br label %loop @@ -1886,25 +2675,79 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( -; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( +; MAY_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; MAY_FAULT-NEXT: 
entry: +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] +; +; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( +; NO_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; NO_FAULT-NEXT: entry: +; NO_FAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO_FAULT: vector.ph: +; NO_FAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; NO_FAULT: vector.body: +; NO_FAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ] +; NO_FAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_INC3]] ] +; NO_FAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; NO_FAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; NO_FAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; NO_FAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; NO_FAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 
[[TMP0]] +; NO_FAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; NO_FAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; NO_FAULT-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; NO_FAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], +; NO_FAULT-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; NO_FAULT-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]] +; NO_FAULT: loop.inc3: +; NO_FAULT-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; NO_FAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; NO_FAULT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; NO_FAULT-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; NO_FAULT: vector.early.exit: +; NO_FAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[TMP9]] +; NO_FAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; NO_FAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INDEX1]] +; NO_FAULT-NEXT: [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP12]] +; NO_FAULT-NEXT: br label [[LOOP_END:%.*]] +; NO_FAULT: middle.block: +; NO_FAULT-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; NO_FAULT: scalar.ph: +; NO_FAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; NO_FAULT-NEXT: br label [[LOOP:%.*]] +; NO_FAULT: loop: +; NO_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; NO_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; NO_FAULT-NEXT: [[LD2:%.*]] = load 
i8, ptr [[ARRAYIDX1]], align 1 +; NO_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; NO_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; NO_FAULT: loop.inc: +; NO_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; NO_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; NO_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP43:![0-9]+]] +; NO_FAULT: loop.end: +; NO_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ] +; NO_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: br label %loop @@ -1934,3 +2777,87 @@ declare i32 @foo(i32) readonly declare @foo_vec() attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } + +;. +; MAY_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; MAY_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; MAY_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; MAY_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP16]] = distinct 
!{[[LOOP16]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; MAY_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; MAY_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} +;. 
+; NO_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP28]] = 
distinct !{[[LOOP28]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]} +; NO_FAULT: [[LOOP42]] = distinct !{[[LOOP42]], [[META1]], [[META2]]} +; NO_FAULT: [[LOOP43]] = distinct !{[[LOOP43]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 9958d6ea124f81..28ae078d5cb987 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -8,6 +8,7 @@ #include "../lib/Transforms/Vectorize/VPlanVerifier.h" #include "../lib/Transforms/Vectorize/VPlan.h" +#include "../lib/Transforms/Vectorize/VPlanTransforms.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "gtest/gtest.h" @@ -28,6 +29,10 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { VPBasicBlock *VPBB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); + + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + VPlan Plan(VPPH, &*TC, VPBB1); #if GTEST_HAS_STREAM_REDIRECTION @@ -57,6 +62,10 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPBB2->appendRecipe(BranchOnCond); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); + + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + VPBlockUtils::connectBlocks(VPBB1, R1); auto TC = std::make_unique(); @@ -102,6 +111,9 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + auto TC = std::make_unique(); VPlan Plan(VPPH, &*TC, VPBB1); @@ -138,6 +150,9 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBlockUtils::connectBlocks(VPBB1, R1); + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + auto TC = std::make_unique(); VPlan Plan(VPPH, &*TC, VPBB1); @@ -175,6 +190,9 @@ TEST(VPVerifierTest,
DuplicateSuccessorsInsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB3->setParent(R1); + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + auto TC = std::make_unique(); VPlan Plan(VPPH, &*TC, VPBB1); @@ -204,6 +222,9 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPBlockUtils::connectBlocks(VPBB1, R1); VPBB1->setParent(R1); + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + auto TC = std::make_unique(); VPlan Plan(VPPH, &*TC, VPBB1); @@ -217,4 +238,120 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { #endif } +TEST(VPVerifierTest, LoopRegionMultipleSuccessors1) { + VPInstruction *TC = new VPInstruction(Instruction::Add, {}); + VPBasicBlock *VPBBPH = new VPBasicBlock("preheader"); + VPBBPH->appendRecipe(TC); + + VPInstruction *TC2 = new VPInstruction(Instruction::Add, {}); + VPBasicBlock *VPBBENTRY = new VPBasicBlock("entry"); + VPBBENTRY->appendRecipe(TC2); + + // Add a VPCanonicalIVPHIRecipe to the header, using TC2 as its start value.
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(TC2, {}); + VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); + VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1}); + VPInstruction *I3 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); + + VPBasicBlock *RBB1 = new VPBasicBlock(); + RBB1->appendRecipe(CanonicalIVPHI); + RBB1->appendRecipe(I1); + RBB1->appendRecipe(I2); + RBB1->appendRecipe(I3); + RBB1->setName("bb1"); + + VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); + VPInstruction *I5 = new VPInstruction(VPInstruction::BranchOnCond, {I4}); + VPBasicBlock *RBB2 = new VPBasicBlock(); + RBB2->appendRecipe(I4); + RBB2->appendRecipe(I5); + RBB2->setName("bb2"); + + VPRegionBlock *R1 = new VPRegionBlock(RBB1, RBB2, "R1"); + VPBlockUtils::connectBlocks(RBB1, RBB2); + VPBlockUtils::connectBlocks(VPBBENTRY, R1); + + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBasicBlock *VPEARLYEXIT = new VPBasicBlock("early.exit"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + VPBlockUtils::connectBlocks(R1, VPEARLYEXIT); + + VPlan Plan(VPBBPH, TC, VPBBENTRY); + Plan.setName("TestPlan"); + Plan.addVF(ElementCount::getFixed(4)); + Plan.getVectorLoopRegion()->setExiting(RBB2); + Plan.getVectorLoopRegion()->setEarlyExiting(RBB1); + Plan.getVectorLoopRegion()->setEarlyExit(VPEARLYEXIT); + + EXPECT_TRUE(verifyVPlanIsValid(Plan)); +} + +TEST(VPVerifierTest, LoopRegionMultipleSuccessors2) { + VPInstruction *TC = new VPInstruction(Instruction::Add, {}); + VPBasicBlock *VPBBPH = new VPBasicBlock("preheader"); + VPBBPH->appendRecipe(TC); + + VPInstruction *TC2 = new VPInstruction(Instruction::Add, {}); + VPBasicBlock *VPBBENTRY = new VPBasicBlock("entry"); + VPBBENTRY->appendRecipe(TC2); + + // We can't create a live-in without a VPlan, but we can't create + // a VPlan without the blocks. So seed the canonical IV with TC2 as a + // placeholder start value here, then fix it up after building the VPlan.
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(TC2, {}); + VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); + VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1}); + VPInstruction *I3 = new VPInstruction(VPInstruction::BranchOnCond, {I1}); + + VPBasicBlock *RBB1 = new VPBasicBlock(); + RBB1->appendRecipe(CanonicalIVPHI); + RBB1->appendRecipe(I1); + RBB1->appendRecipe(I2); + RBB1->appendRecipe(I3); + RBB1->setName("vector.body"); + + // This mimics the VPlan CFG before optimization, empty blocks included. + VPBasicBlock *RBB2 = new VPBasicBlock(); + RBB2->setName("loop.inc"); + // RBB2 (above) inherits the latch name from the original scalar loop. + + VPBasicBlock *RBB3 = new VPBasicBlock(); + // RBB3 (above) is intentionally left unnamed. + + VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); + VPInstruction *I5 = new VPInstruction(VPInstruction::BranchOnCond, {I4}); + VPBasicBlock *RBB4 = new VPBasicBlock(); + RBB4->appendRecipe(I4); + RBB4->appendRecipe(I5); + RBB4->setName("vector.latch"); + + VPRegionBlock *R1 = new VPRegionBlock(RBB1, RBB4, "R1"); + VPBlockUtils::insertBlockAfter(RBB2, RBB1); + VPBlockUtils::insertBlockAfter(RBB3, RBB2); + VPBlockUtils::insertBlockAfter(RBB4, RBB3); + VPBlockUtils::connectBlocks(VPBBENTRY, R1); + + VPBasicBlock *VPMIDDLE = new VPBasicBlock("middle.block"); + VPBasicBlock *VPEARLYEXIT = new VPBasicBlock("early.exit"); + VPBlockUtils::connectBlocks(R1, VPMIDDLE); + VPBlockUtils::connectBlocks(R1, VPEARLYEXIT); + + VPlan Plan(VPBBPH, TC, VPBBENTRY); + Plan.setName("TestPlan"); + Plan.addVF(ElementCount::getFixed(4)); + Plan.getVectorLoopRegion()->setExiting(RBB4); + Plan.getVectorLoopRegion()->setEarlyExiting(RBB1); + Plan.getVectorLoopRegion()->setEarlyExit(VPEARLYEXIT); + + // Now that the VPlan exists, give the canonical IV a live-in start value.
+ LLVMContext C; + IntegerType *Int32 = IntegerType::get(C, 32); + Value *StartIV = PoisonValue::get(Int32); + CanonicalIVPHI->setStartValue(Plan.getOrAddLiveIn(StartIV)); + + EXPECT_TRUE(verifyVPlanIsValid(Plan)); + + VPlanTransforms::optimize(Plan); +} + } // namespace