From 47258deea863675e43fd7fd48376dce131441dc5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Sep 2024 21:35:57 +0100 Subject: [PATCH] [VPlan] Dispatch to multiple exit blocks via middle blocks. A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193, which dispatches to multiple exit blocks via the middle blocks. --- .../Vectorize/LoopVectorizationLegality.h | 3 + .../Vectorize/LoopVectorizationLegality.cpp | 29 +++ .../Transforms/Vectorize/LoopVectorize.cpp | 82 +++--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 39 ++- llvm/lib/Transforms/Vectorize/VPlan.h | 1 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 82 ++++++ .../Transforms/Vectorize/VPlanTransforms.h | 4 + .../Transforms/Vectorize/VPlanVerifier.cpp | 8 - .../LoopVectorize/X86/multi-exit-codegen.ll | 240 ++++++++++++++++++ .../LoopVectorize/X86/multi-exit-cost.ll | 18 +- .../LoopVectorize/X86/multi-exit-vplan.ll | 148 +++++++++++ 12 files changed, 614 insertions(+), 56 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index dc7e484a40a452..af6fae44cf0f09 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -287,6 +287,9 @@ class LoopVectorizationLegality { /// we can use in-order reductions. bool canVectorizeFPMath(bool EnableStrictReductions); + /// Returns true if the loop has an early exit that we can vectorize. + bool canVectorizeEarlyExit() const; + /// Return true if we can vectorize this loop while folding its tail by /// masking. 
bool canFoldTailByMasking() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 43be72f0f34d45..ee53d28a4c8282 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables.")); +static cl::opt + EnableEarlyExitVectorization("enable-early-exit-vectorization", + cl::init(false), cl::Hidden, cl::desc("")); + namespace llvm { cl::opt HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, @@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence( } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { + // When vectorizing early exits, create predicates for all blocks, except the + // header. + if (canVectorizeEarlyExit() && BB != TheLoop->getHeader()) + return true; return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { return true; } +bool LoopVectorizationLegality::canVectorizeEarlyExit() const { + // Currently only allow vectorizing loops with early exits, if early-exit + // vectorization is explicitly enabled and the loop has metadata to force + // vectorization. + if (!EnableEarlyExitVectorization) + return false; + + SmallVector Exiting; + TheLoop->getExitingBlocks(Exiting); + if (Exiting.size() == 1) + return false; + + LoopVectorizeHints Hints(TheLoop, true, *ORE); + if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) + return false; + + Function *Fn = TheLoop->getHeader()->getParent(); + return Hints.allowVectorization(Fn, TheLoop, + true /*VectorizeOnlyWhenForced*/); +} + // Helper function to canVectorizeLoopNestCFG. 
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e8653498d32a12..befe8f7c0076a3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel { // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - LLVM_DEBUG( - dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); - return true; + if (!Legal->canVectorizeEarlyExit()) { + LLVM_DEBUG( + dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); + return true; + } } if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " @@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) || + Legal->canVectorizeEarlyExit()) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -2758,8 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, // value (the value that feeds into the phi from the loop latch). // We allow both, but they, obviously, have different values. 
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); - DenseMap MissingVals; // An external user of the last iteration's value should see the value that @@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, if (PHI->getBasicBlockIndex(MiddleBlock) == -1) PHI->addIncoming(I.second, MiddleBlock); } + + assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) && + "Expected a single exit block"); } namespace { @@ -3599,7 +3603,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() && + (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit())) AddToWorklistIfAllowed(Cmp); } @@ -7692,12 +7697,15 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + VPBasicBlock *ExitVPBB = nullptr; + if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) { + ExitVPBB = cast( + BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction( + dyn_cast(&R), State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + } } // 2.6. 
Maintain Loop Hints @@ -7723,6 +7731,7 @@ DenseMap LoopVectorizationPlanner::executePlan( LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } + TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) @@ -7735,15 +7744,17 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (ExitVPBB) { + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; @@ -8128,7 +8139,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid // adding uses of an otherwise potentially dead instruction. 
- if (OrigLoop->isLoopExiting(Src)) + if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); @@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { + if (!Plan.getVectorLoopRegion()->getSingleSuccessor()) + return {}; auto *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); // No edge from the middle block to the unique exit block has been inserted @@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences( // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. + if (!VectorRegion->getSingleSuccessor()) + return; auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); VPBasicBlock *ScalarPHVPBB = nullptr; if (MiddleVPBB->getNumSuccessors() == 2) { @@ -9146,10 +9161,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - SetVector ExitUsersToFix = collectUsersInExitBlock( - OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); - addUsersInExitBlock(*Plan, ExitUsersToFix); + if (Legal->canVectorizeEarlyExit()) { + VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop, + RecipeBuilder); + } else { + SetVector ExitUsersToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); + addUsersInExitBlock(*Plan, ExitUsersToFix); + } // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -9277,8 +9297,6 @@ void 
LoopVectorizationPlanner::adjustRecipesForReductions( using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); - VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9297,8 +9315,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert(UserRecipe->getParent() == MiddleVPBB && - "U must be either in the loop region or the middle block."); continue; } Worklist.insert(UserRecipe); @@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*LatchVPBB->begin()); + if (!VectorLoopRegion->getSingleSuccessor()) + return; + VPBasicBlock *MiddleVPBB = + cast(VectorLoopRegion->getSingleSuccessor()); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8609514c39e7d0..eb7c808551340d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) { // backedges. A backward successor is set when the branch is created. const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + if (TermBr->getSuccessor(idx) && + PredVPBlock == getPlan()->getVectorLoopRegion() && + PredVPBlock->getNumSuccessors()) { + // Update PRedBB and TermBr for BranchOnMultiCond in predecessor. 
+ PredBB = TermBr->getSuccessor(1); + TermBr = cast(PredBB->getTerminator()); + idx = 0; + } assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, IRBB); @@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); if (!RequiresScalarEpilogueCheck) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); return Plan; } @@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); - // The connection order corresponds to the operands of the conditional branch. - VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + if (IRExitBlock) { + auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + // The connection order corresponds to the operands of the conditional + // branch. + VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator(); // Here we use the same DebugLoc as the scalar loop latch terminator instead @@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) { // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); + getVectorLoopRegion()->getNumSuccessors() == 1 + ? 
cast(getVectorLoopRegion()->getSuccessors()[0]) + : cast(getVectorLoopRegion()->getSuccessors()[1]); // Find the VPBB for the scalar preheader, relying on the current structure // when creating the middle block and its successrs: if there's a single // predecessor, it must be the scalar preheader. Otherwise, the second @@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) { MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]); assert(!isa(ScalarPhVPBB) && "scalar preheader cannot be wrapped already"); + if (ScalarPhVPBB->getNumSuccessors() != 0) { + ScalarPhVPBB = cast(ScalarPhVPBB->getSuccessors()[1]); + MiddleVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh); replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); @@ -1065,6 +1083,10 @@ void VPlan::execute(VPTransformState *State) { VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + if (!getVectorLoopRegion()->getSingleSuccessor()) + VectorLatchBB = + cast(VectorLatchBB->getTerminator())->getSuccessor(1); + // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); @@ -1091,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) { // Move the last step to the end of the latch block. This ensures // consistent placement of all induction updates. Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI()) + Inc->moveBefore(VectorLatchBB->getTerminator()); + else + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); // Use the steps for the last part as backedge value for the induction. 
if (auto *IV = dyn_cast(&R)) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 59a084401cc9bf..21f44eac188936 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + AnyOf, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index be3e958320e771..9d5c609ad26043 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const { default: return true; } + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPScalarCastSC: return false; + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInstructionSC: return mayWriteToMemory(); case VPWidenCallSC: { @@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: + case VPInstruction::AnyOf: return true; default: return false; @@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } + case VPInstruction::AnyOf: { + Value *A = State.get(getOperand(0)); + return Builder.CreateOrReduce(A); + } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == 
VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::AnyOf; } bool VPInstruction::isSingleScalar() const { @@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return false; case Instruction::ICmp: case Instruction::Select: + case Instruction::Or: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); @@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::AnyOf: + O << "any-of"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d50f3c0c3f3e04..a86498eb9aa30c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); + for (VPRecipeBase &R : make_early_inc_range( + reverse(*cast(Plan.getPreheader())))) { + if (isDeadRecipe(R)) + R.eraseFromParent(); + } + for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly(RPOT))) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes. 
@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups( } } } + +void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder) { + auto *LatchVPBB = + cast(Plan.getVectorLoopRegion()->getExiting()); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + + const SCEV *BackedgeTakenCount = + SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *TripCount = SE.getTripCountFromExitCount( + BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop); + VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE); + Plan.getTripCount()->replaceAllUsesWith(NewTC); + Plan.resetTripCount(NewTC); + + VPValue *EarlyExitTaken = nullptr; + SmallVector ExitingBBs; + OrigLoop->getExitingBlocks(ExitingBBs); + for (BasicBlock *Exiting : ExitingBBs) { + auto *ExitingTerm = cast(Exiting->getTerminator()); + BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0); + BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1); + VPIRBasicBlock *VPExitBlock; + if (OrigLoop->getUniqueExitBlock()) + VPExitBlock = cast(MiddleVPBB->getSuccessors()[0]); + else + VPExitBlock = VPIRBasicBlock::fromBasicBlock( + !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); + + for (VPRecipeBase &R : *VPExitBlock) { + auto *ExitIRI = cast(&R); + auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); + if (!ExitPhi) + break; + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting); + VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue); + ExitIRI->addOperand(V); + } + + if (Exiting == OrigLoop->getLoopLatch()) { + if (MiddleVPBB->getNumSuccessors() == 0) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } + continue; + } + + VPValue *M = RecipeBuilder.getBlockInMask( + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *N = Builder.createNot(M); + EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N}); + + VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB); + VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion); + VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock); + VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB); + + VPBuilder MiddleBuilder(NewMiddle); + MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken}); + // MiddleVPBB = NewMiddle; + } + auto *Term = dyn_cast(LatchVPBB->getTerminator()); + auto *IsLatchExiting = Builder.createICmp( + CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1)); + auto *AnyExiting = + Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting); + Term->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 60a44bfb0dca6b..9745211db275f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -123,6 +123,10 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); + + static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7ea5ee341cc547..1ac79f8887ab46 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } - VPBlockBase *MiddleBB = - IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor(); - if (IRBB != IRBB->getPlan()->getPreheader() && - IRBB->getSinglePredecessor() != MiddleBB) { - errs() << "VPIRBasicBlock can only be used as pre-header or a successor of " - "middle-block at the moment!\n"; - return false; - } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll new file mode 100644 index 00000000000000..0c33715c6bd271 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s + +define i64 @multi_exit_with_store(ptr %p, i64 %N) { +; MULTI-LABEL: define i64 @multi_exit_with_store( +; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; MULTI-NEXT: [[ENTRY:.*]]: +; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MULTI: [[VECTOR_PH]]: +; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] 
= insertelement <4 x i64> poison, i64 [[N]], i64 0 +; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; MULTI-NEXT: br label %[[VECTOR_BODY:.*]] +; MULTI: [[VECTOR_BODY]]: +; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]] +; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]]) +; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], +; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; MULTI: [[MIDDLE_SPLIT]]: +; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]] +; MULTI: [[MIDDLE_BLOCK]]: +; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]] +; MULTI: [[SCALAR_PH]]: +; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MULTI-NEXT: br label %[[LOOP_HEADER:.*]] +; MULTI: [[LOOP_HEADER]]: +; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; MULTI-NEXT: br i1 [[CMP1]], 
label %[[E1]], label %[[LOOP_LATCH]] +; MULTI: [[LOOP_LATCH]]: +; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; MULTI: [[E1]]: +; MULTI-NEXT: ret i64 0 +; MULTI: [[E2]]: +; MULTI-NEXT: ret i64 1 +; +; DEFAULT-LABEL: define i64 @multi_exit_with_store( +; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127) +; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]] +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br 
label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] +; DEFAULT: [[LOOP_HEADER]]: +; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]] +; DEFAULT: [[LOOP_LATCH]]: +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: [[E1]]: +; DEFAULT-NEXT: ret i64 0 +; DEFAULT: [[E2]]: +; DEFAULT-NEXT: ret i64 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1 + +e1: + ret i64 0 + +e2: + ret i64 1 +} + +define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) { +; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store( +; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; MULTI-NEXT: [[ENTRY:.*]]: +; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MULTI: [[VECTOR_PH]]: +; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; MULTI-NEXT: br label 
%[[VECTOR_BODY:.*]] +; MULTI: [[VECTOR_BODY]]: +; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]] +; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]]) +; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], +; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; MULTI: [[MIDDLE_SPLIT]]: +; MULTI-NEXT: br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]] +; MULTI: [[MIDDLE_BLOCK]]: +; MULTI-NEXT: br i1 true, label %[[E]], label %[[SCALAR_PH]] +; MULTI: [[SCALAR_PH]]: +; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MULTI-NEXT: br label %[[LOOP_HEADER:.*]] +; MULTI: [[LOOP_HEADER]]: +; MULTI-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; MULTI-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]] +; MULTI-NEXT: br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]] +; MULTI: [[LOOP_LATCH]]: +; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]] +; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; MULTI-NEXT: 
[[INC]] = add nuw i64 [[IV]], 1 +; MULTI-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 +; MULTI-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; MULTI: [[E]]: +; MULTI-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ] +; MULTI-NEXT: ret i64 [[P1]] +; +; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store( +; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127) +; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]] +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT: [[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], 
[ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] +; DEFAULT: [[LOOP_HEADER]]: +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; DEFAULT-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]] +; DEFAULT-NEXT: br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]] +; DEFAULT: [[LOOP_LATCH]]: +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]] +; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INC]] = add nuw i64 [[IV]], 1 +; DEFAULT-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 +; DEFAULT-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT: [[E]]: +; DEFAULT-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ] +; DEFAULT-NEXT: ret i64 [[P1]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e, label %loop.header, !llvm.loop !1 + +e: + %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p1 +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. 
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index cd128979fc1431..1c02f10753745c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] -; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]]) -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]]) -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; 
CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]] +; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]]) +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll new file mode 100644 index 00000000000000..5c5d532b93bc89 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s + +define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) { +; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N> +; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store 
vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; 
CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 
0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e, label %loop.header, !llvm.loop !1 + +e: + %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p1 +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true}