diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst index a4462e53edda09..f134a6df94a69a 100644 --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -399,6 +399,19 @@ small trip counts. .. image:: epilogue-vectorization-cfg.png +Early Exit Vectorization +^^^^^^^^^^^^^^^^^^^^^^^^ + +When vectorizing a loop with a single early exit, the loop blocks following the +early exit are predicated and the vector loop will always exit via the latch. +If the early exit has been taken, the vector loop's successor block +(``middle.split`` below) branches to the early exit block. Otherwise +``middle.block`` selects between the exit block from the latch and the scalar +remainder loop. + +.. image:: vplan-early-exit.png + + Performance ----------- diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index dc7e484a40a452..fbe80eddbae07a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -395,6 +395,11 @@ class LoopVectorizationLegality { /// Returns the uncountable early exiting block. 
BasicBlock *getUncountableEarlyExitingBlock() const { + if (!HasUncountableEarlyExit) { + assert(getUncountableExitingBlocks().empty() && + "Expected no uncountable exiting blocks"); + return nullptr; + } assert(getUncountableExitingBlocks().size() == 1 && "Expected only a single uncountable exiting block"); return getUncountableExitingBlocks()[0]; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f1568781252c06..555c8435dd330d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1375,6 +1375,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence( } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { + // When vectorizing early exits, create predicates for the latch block only. + // The early exiting block must be a direct predecessor of the latch at the + // moment. + BasicBlock *Latch = TheLoop->getLoopLatch(); + if (hasUncountableEarlyExit()) { + assert( + is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) && + "Uncountable exiting block must be a direct predecessor of latch"); + return BB == Latch; + } return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -1788,13 +1798,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { HasUncountableEarlyExit = false; if (isa(PSE.getBackedgeTakenCount())) { + HasUncountableEarlyExit = true; if (!isVectorizableEarlyExitLoop()) { + UncountableExitingBlocks.clear(); + HasUncountableEarlyExit = false; if (DoExtraAnalysis) Result = false; else return false; - } else - HasUncountableEarlyExit = true; + } } // Go over each instruction and look at memory deps. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index de164ee434d647..ed00c844285c62 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -385,6 +385,11 @@ static cl::opt UseWiderVFIfCallVariantsPresent( cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants")); +static cl::opt EnableEarlyExitVectorization( + "enable-early-exit-vectorization", cl::init(false), cl::Hidden, + cl::desc( + "Enable vectorization of early exit loops with uncountable exits.")); + // Likelyhood of bypassing the vectorized loop because assumptions about SCEV // variables not overflowing do not hold. See `emitSCEVChecks`. static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; @@ -1382,9 +1387,10 @@ class LoopVectorizationCostModel { LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); return false; } - // If we might exit from anywhere but the latch, must run the exiting - // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // If we might exit from anywhere but the latch and early exit vectorization + // is disabled, we must run the exiting iteration in scalar form. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && + !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " "from latch block\n"); return true; @@ -3656,10 +3662,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Start with the conditional branches exiting the loop. If the branch // condition is an instruction contained in the loop that is only used by the - // branch, it is uniform. + // branch, it is uniform. Note conditions from uncountable early exits are not + // uniform. 
SmallVector Exiting; TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { + if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) + continue; auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) AddToWorklistIfAllowed(Cmp); @@ -8239,8 +8248,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid - // adding uses of an otherwise potentially dead instruction. - if (OrigLoop->isLoopExiting(Src)) + // adding uses of an otherwise potentially dead instruction unless we are + // vectorizing a loop with uncountable exits. In that case, we always + // materialize the mask. + if (OrigLoop->isLoopExiting(Src) && + Src != Legal->getUncountableEarlyExitingBlock()) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); @@ -8931,14 +8943,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { static SetVector collectUsersInExitBlocks( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { + auto *MiddleVPBB = Plan.getMiddleBlock(); SetVector ExitUsersToFix; for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { - BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock(); - BasicBlock *ExitingBB = find_singleton( - to_vector(predecessors(ExitBB)), - [OrigLoop](BasicBlock *Pred, bool AllowRepeats) { - return OrigLoop->contains(Pred) ? 
Pred : nullptr; - }); for (VPRecipeBase &R : *ExitVPBB) { auto *ExitIRI = dyn_cast(&R); if (!ExitIRI) @@ -8946,35 +8953,48 @@ static SetVector collectUsersInExitBlocks( auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); if (!ExitPhi) break; - Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); - VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); - // Exit values for inductions are computed and updated outside of VPlan - // and independent of induction recipes. - // TODO: Compute induction exit values in VPlan. - if ((isa(V) && - !cast(V)->getTruncInst()) || - isa(V) || - (isa(IncomingValue) && - OrigLoop->contains(cast(IncomingValue)) && - any_of(IncomingValue->users(), [&Inductions](User *U) { - auto *P = dyn_cast(U); - return P && Inductions.contains(P); - }))) - continue; - ExitUsersToFix.insert(ExitIRI); - ExitIRI->addOperand(V); + for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { + BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); + if (PredVPBB != MiddleVPBB) { + SmallVector ExitingBlocks; + OrigLoop->getExitingBlocks(ExitingBlocks); + assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); + ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1] + : ExitingBlocks[0]; + } + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); + VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); + // Exit values for inductions are computed and updated outside of VPlan + // and independent of induction recipes. + // TODO: Compute induction exit values in VPlan. 
+ if ((isa(V) && + !cast(V)->getTruncInst()) || + isa(V) || + (isa(IncomingValue) && + OrigLoop->contains(cast(IncomingValue)) && + any_of(IncomingValue->users(), [&Inductions](User *U) { + auto *P = dyn_cast(U); + return P && Inductions.contains(P); + }))) { + if (ExitVPBB->getSinglePredecessor() == MiddleVPBB) + continue; + } + ExitUsersToFix.insert(ExitIRI); + ExitIRI->addOperand(V); + } } } return ExitUsersToFix; } // Add exit values to \p Plan. Extracts are added for each entry in \p -// ExitUsersToFix if needed and their operands are updated. -static void +// ExitUsersToFix if needed and their operands are updated. Returns true if all +// exit users can be handled, otherwise return false. +static bool addUsersInExitBlocks(VPlan &Plan, const SetVector &ExitUsersToFix) { if (ExitUsersToFix.empty()) - return; + return true; auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); @@ -8988,14 +9008,18 @@ addUsersInExitBlocks(VPlan &Plan, if (V->isLiveIn()) continue; - assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && - "Exit value not handled yet for this edge."); + // Currently only live-ins can be used by exit values from blocks not + // exiting via the vector latch through to the middle block. 
+ if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) + return false; + LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, {V, Plan.getOrAddLiveIn(ConstantInt::get( IntegerType::get(Ctx, 32), 1))}); ExitIRI->setOperand(0, Ext); } + return true; } /// Handle users in the exit block for first order reductions in the original @@ -9268,11 +9292,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + if (auto *UncountableExitingBlock = + Legal->getUncountableEarlyExitingBlock()) { + VPlanTransforms::handleUncountableEarlyExit( + *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); + } addScalarResumePhis(RecipeBuilder, *Plan); SetVector ExitUsersToFix = collectUsersInExitBlocks( OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); - addUsersInExitBlocks(*Plan, ExitUsersToFix); + if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { + reportVectorizationFailure( + "Some exit values in loop with uncountable exit not supported yet", + "Some exit values in loop with uncountable exit not supported yet", + "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); + return nullptr; + } + // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. 
@@ -10138,12 +10174,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasUncountableEarlyExit()) { + if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { reportVectorizationFailure("Auto-vectorization of loops with uncountable " - "early exit is not yet supported", + "early exit is not enabled", "Auto-vectorization of loops with uncountable " - "early exit is not yet supported", - "UncountableEarlyExitLoopsUnsupported", ORE, L); + "early exit is not enabled", + "UncountableEarlyExitLoopsDisabled", ORE, L); return false; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5122232ffe9b8e..81c76bc99fbf74 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -861,14 +861,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, auto Plan = std::make_unique(Entry, VecPreheader, ScalarHeader); // Create SCEV and VPValue for the trip count. - - // Currently only loops with countable exits are vectorized, but calling - // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with - // uncountable exits whilst also ensuring the symbolic maximum and known - // back-edge taken count remain identical for loops with countable exits. + // We use the symbolic max backedge-taken-count, which works also when + // vectorizing loops with uncountable early exits. const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert((!isa(BackedgeTakenCountSCEV) && - BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) && + assert(!isa(BackedgeTakenCountSCEV) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, @@ -903,7 +899,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // 2) If we require a scalar epilogue, there is no conditional branch as // we unconditionally branch to the scalar preheader. Do nothing. 
// 3) Otherwise, construct a runtime check. - BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); + BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8794517b777f3b..7440a3a386fd2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -621,6 +621,14 @@ class VPBlockBase { /// Remove all the successors of this block. void clearSuccessors() { Successors.clear(); } + /// Swap successors of the block. The block must have exactly 2 successors. + // TODO: This should be part of introducing conditional branch recipes rather + // than being independent. + void swapSuccessors() { + assert(Successors.size() == 2 && "must have 2 successors to swap"); + std::swap(Successors[0], Successors[1]); + } + /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(VPTransformState *State) = 0; @@ -1232,6 +1240,9 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + // Returns a scalar boolean value, which is true if any lane of its single + // operand is true. + AnyOf, }; private: @@ -3884,10 +3895,10 @@ class VPlan { /// whether to execute the scalar tail loop or the exit block from the loop /// latch. 
const VPBasicBlock *getMiddleBlock() const { - return cast(getVectorLoopRegion()->getSingleSuccessor()); + return cast(getScalarPreheader()->getSinglePredecessor()); } VPBasicBlock *getMiddleBlock() { - return cast(getVectorLoopRegion()->getSingleSuccessor()); + return cast(getScalarPreheader()->getSinglePredecessor()); } /// Return the VPBasicBlock for the preheader of the scalar loop. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e882368544e815..8fea2ca9461047 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -57,6 +57,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case Instruction::Or: case Instruction::ICmp: case Instruction::Select: + case VPInstruction::AnyOf: case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: @@ -361,6 +362,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: + case VPInstruction::AnyOf: return true; default: return false; @@ -639,6 +641,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } + case VPInstruction::AnyOf: { + Value *A = State.get(getOperand(0)); + return Builder.CreateOrReduce(A); + } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -647,7 +653,8 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::AnyOf; } bool VPInstruction::isSingleScalar() const { @@ -710,6 +717,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return false; case Instruction::ICmp: case 
Instruction::Select: + case Instruction::Or: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); @@ -805,6 +813,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::AnyOf: + O << "any-of"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -822,12 +833,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, void VPIRInstruction::execute(VPTransformState &State) { assert((isa(&I) || getNumOperands() == 0) && "Only PHINodes can have extra operands"); - if (getNumOperands() == 1) { - VPValue *ExitValue = getOperand(0); + for (const auto &[Idx, Op] : enumerate(operands())) { + VPValue *ExitValue = Op; auto Lane = vputils::isUniformAfterVectorization(ExitValue) ? VPLane::getFirstLane() : VPLane::getLastLaneForVF(State.VF); - auto *PredVPBB = cast(getParent()->getSinglePredecessor()); + VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; + auto *PredVPBB = Pred->getExitingBasicBlock(); BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; // Set insertion point in PredBB in case an extract needs to be generated. // TODO: Model extracts explicitly. @@ -860,11 +872,13 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, O << Indent << "IR " << I; if (getNumOperands() != 0) { - assert(getNumOperands() == 1 && "can have at most 1 operand"); - O << " (extra operand: "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << " from "; - getParent()->getPredecessors()[0]->printAsOperand(O); + O << " (extra operand" << (getNumOperands() > 1 ? 
"s" : "") << ": "; + interleaveComma( + enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) { + Op.value()->printAsOperand(O, SlotTracker); + O << " from "; + getParent()->getPredecessors()[Op.index()]->printAsOperand(O); + }); O << ")"; } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 922cba7831f4e9..e27c1bfba93525 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1858,3 +1858,62 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { } } } + +void VPlanTransforms::handleUncountableEarlyExit( + VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, + BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + auto *LatchVPBB = cast(LoopRegion->getExiting()); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *MiddleVPBB = Plan.getMiddleBlock(); + VPValue *IsEarlyExitTaken = nullptr; + + // Process the uncountable exiting block. Update IsEarlyExitTaken, which + // tracks if the uncountable early exit has been taken. Also split the middle + // block and have it conditionally branch to the early exit block if + // EarlyExitTaken. + auto *EarlyExitingBranch = + cast(UncountableExitingBlock->getTerminator()); + BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0); + BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1); + + // The early exit block may or may not be the same as the "countable" exit + // block. Creates a new VPIRBB for the early exit block in case it is distinct + // from the countable exit block. + // TODO: Introduce both exit blocks during VPlan skeleton construction. + VPIRBasicBlock *VPEarlyExitBlock; + if (OrigLoop->getUniqueExitBlock()) { + VPEarlyExitBlock = cast(MiddleVPBB->getSuccessors()[0]); + } else { + VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock( + !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); + } + + VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask( + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond); + IsEarlyExitTaken = + Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); + + VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); + VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); + NewMiddle->swapSuccessors(); + + VPBuilder MiddleBuilder(NewMiddle); + MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); + + // Replace the condition controlling the non-early exit from the vector loop + // with one exiting if either the original condition of the vector latch is + // true or the early exit has been taken. + auto *LatchExitingBranch = cast(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + auto *AnyExitTaken = Builder.createNaryOp( + Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 9cf314a6a9f447..fddde868911665 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -124,6 +124,17 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); + /// Update \p Plan to account for the uncountable early exit block in \p + /// UncountableExitingBlock by + /// * updating the condition exiting the vector loop to include the early + /// exit conditions + /// * splitting the original middle block to branch to the early exit block + /// if taken. + static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + BasicBlock *UncountableExitingBlock, + VPRecipeBuilder &RecipeBuilder); + /// Lower abstract recipes to concrete ones, that can be codegen'd. static void convertToConcreteRecipes(VPlan &Plan); }; diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll index 21433477c1d7a3..2a99693523d3cf 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; REQUIRES: asserts -; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -disable-output 2>&1 | FileCheck %s declare void @init_mem(ptr, i64); @@ -11,7 +11,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' ; CHECK: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
+; CHECK-NOT: LV: Not vectorizing: entry: %p1 = alloca [1024 x i32] %p2 = alloca [1024 x i32] @@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +; CHECK-NOT: LV: Not vectorizing entry: %p1 = alloca [1024 x i8] %p2 = alloca [1024 x i8] @@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. 
entry: %p1 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 52f82d007de4df..08a333fa865154 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 | FileCheck %s declare void @init_mem(ptr, i64); @@ -11,21 +11,47 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP1]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_SPLIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -62,19 +88,45 @@ define i64 @diff_exit_block_phi_of_consts() { ; 
CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.early.exit: ; CHECK-NEXT: ret i64 0 ; CHECK: loop.end: @@ -119,22 +171,66 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) ; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[END]] to i10 +; CHECK-NEXT: [[TMP20:%.*]] = zext i10 [[TMP19]] to i64 +; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP20]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 1, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255 +; CHECK-NEXT: 
[[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP16]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND]], label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 ; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: found: ; CHECK-NEXT: ret i32 1 ; CHECK: exit: @@ -183,14 +279,33 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x 
i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276 +; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP1]], label [[EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 266, [[MIDDLE_BLOCK]] ], [ -10, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT]] ; CHECK: for.inc: ; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: early.exit: ; CHECK-NEXT: tail call void @abort() ; CHECK-NEXT: unreachable @@ -218,3 +333,15 @@ early.exit: for.end: ret i32 0 } +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll new file mode 100644 index 00000000000000..c45634913ce0b2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s + +declare void @init(ptr) + +define i64 @multi_exiting_to_different_exits_live_in_exit_values() { +; CHECK: multi_exiting_to_different_exits_live_in_exit_values +; CHECK-LABEL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 +; CHECK-NEXT: IR call void @init(ptr %src) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; 
CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10> +; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]> +; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]> +; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]> +; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.split +; CHECK-EMPTY: +; CHECK-NEXT: middle.split: +; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> +; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] +; CHECK: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0> from middle.split) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + %src = alloca [128 x i32] + call void @init(ptr %src) + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.1 
= icmp eq i32 %l, 10 + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header + +e1: + %p1 = phi i64 [ 0, %loop.header ] + ret i64 %p1 + +e2: + %p2 = phi i64 [ 1, %loop.latch ] + ret i64 %p2 +} + +define i64 @multi_exiting_to_same_exit_live_in_exit_values() { +; CHECK: multi_exiting_to_same_exit_live_in_exit_values +; CHECK-LABEL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 +; CHECK-NEXT: IR call void @init(ptr %src) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10> +; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]> +; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]> +; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]> +; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.split +; CHECK-EMPTY: +; CHECK-NEXT: middle.split: +; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> 
+; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] +; CHECK: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split) +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +entry: + %src = alloca [128 x i32] + call void @init(ptr %src) + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.1 = icmp eq i32 %l, 10 + br i1 %c.1, label %exit, label %loop.latch + +loop.latch: + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %exit, label %loop.header + +exit: + %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p +} + +define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() { +; CHECK: multi_exiting_to_same_exit_live_in_exit_values_2 +; CHECK-LABEL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %src = alloca [128 x i32], align 4 +; CHECK-NEXT: IR call void @init(ptr %src) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = 
SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10> +; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]> +; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]> +; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]> +; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.split +; CHECK-EMPTY: +; CHECK-NEXT: middle.split: +; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> +; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] +; CHECK: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split) +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +entry: + %src = alloca [128 x i32] + call void @init(ptr %src) + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.1 = icmp eq i32 %l, 10 + br i1 %c.1, label %exit, label %loop.latch + +loop.latch: + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + 
br i1 %c.2, label %exit, label %loop.header + +exit: + %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p + +; uselistorder directives + uselistorder label %exit, { 1, 0 } +} diff --git a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll index cd91d07120f9ee..5b2a95f1b368c3 100644 --- a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 | FileCheck %s declare void @init_mem(ptr, i64);