diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4282f815849a83..b03dd471bd9716 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2383,12 +2383,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, // End if-block. VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); bool IfPredicateInstr = Parent ? Parent->isReplicator() : false; - assert((Parent || all_of(RepRecipe->operands(), - [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - })) && - "Expected a recipe is either within a region or all of its operands " - "are defined outside the vectorized region."); + assert( + (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || + all_of(RepRecipe->operands(), + [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && + "Expected a recipe is either within a region or all of its operands " + "are defined outside the vectorized region."); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } @@ -2448,6 +2448,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { /// scalar preheader. static void introduceCheckBlockInVPlan(VPlan &Plan, BasicBlock *CheckIRBB) { VPBlockBase *ScalarPH = Plan.getScalarPreheader(); + // FIXME: Cannot get the vector preheader at the moment if the vector loop + // region has been removed. VPBlockBase *VectorPH = Plan.getVectorPreheader(); VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); if (PreVectorPH->getNumSuccessors() != 1) { @@ -2988,6 +2990,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); } + if (!State.Plan->getVectorLoopRegion()) + return; + for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -7790,24 +7795,25 @@ DenseMap LoopVectorizationPlanner::executePlan( makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); - VPBasicBlock *HeaderVPBB = - BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); - Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); - if (VectorizedLoopID) - L->setLoopID(*VectorizedLoopID); - else { - // Keep all loop hints from the original loop on the vector loop (we'll - // replace the vectorizer-specific hints below). - if (MDNode *LID = OrigLoop->getLoopID()) - L->setLoopID(LID); - - LoopVectorizeHints Hints(L, true, *ORE); - Hints.setAlreadyVectorized(); + if (auto *R = BestVPlan.getVectorLoopRegion()) { + VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock(); + Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); + if (VectorizedLoopID) { + L->setLoopID(*VectorizedLoopID); + } else { + // Keep all loop hints from the original loop on the vector loop (we'll + // replace the vectorizer-specific hints below). + if (MDNode *LID = OrigLoop->getLoopID()) + L->setLoopID(LID); + + LoopVectorizeHints Hints(L, true, *ORE); + Hints.setAlreadyVectorized(); + } + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) + addRuntimeUnrollDisableMetaData(L); } - TargetTransformInfo::UnrollingPreferences UP; - TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); - if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) - addRuntimeUnrollDisableMetaData(L); // 3. 
Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7816,15 +7822,19 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (auto *R = BestVPlan.getVectorLoopRegion()) { + auto *ExitVPBB = cast(R->getSingleSuccessor()); + + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 0619f47f77cbe9..65a3f046422620 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -447,7 +447,6 @@ void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) { CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}}); } } - void VPIRBasicBlock::execute(VPTransformState *State) { assert(getHierarchicalSuccessors().size() <= 2 && "VPIRBasicBlock can have at most two successors at the moment!"); @@ -487,7 +486,7 @@ void VPBasicBlock::execute(VPTransformState *State) { }; // 1. Create an IR basic block. - if (this == getPlan()->getVectorPreheader() || + if (this == getPlan()->getEntry() || (Replica && this == getParent()->getEntry()) || IsReplicateRegion(getSingleHierarchicalPredecessor())) { // Reuse the previous basic block if the current VPBB is either @@ -557,7 +556,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { template static T *getEnclosingLoopRegionForRegion(T *P) { if (P && P->isReplicator()) { P = P->getParent(); - assert(!cast(P)->isReplicator() && + assert((!P || !cast(P)->isReplicator()) && "unexpected nested replicate regions"); } return P; @@ -936,7 +935,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - assert(VFxUF.getNumUsers() && "VFxUF expected to always have users"); unsigned UF = getUF(); if (VF.getNumUsers()) { Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); @@ -1015,49 +1013,51 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : RPOT) Block->execute(State); - VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); - BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; - - // Fix the latch value of canonical, reduction and first-order recurrences - // phis in the vector loop. 
-  VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
-  for (VPRecipeBase &R : Header->phis()) {
-    // Skip phi-like recipes that generate their backedge values themselves.
-    if (isa<VPWidenPHIRecipe>(&R))
-      continue;
-
-    if (isa<VPWidenInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
-      }
+  if (auto *LoopRegion = getVectorLoopRegion()) {
+    VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+    BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
-      Phi->setIncomingBlock(1, VectorLatchBB);
+    // Fix the latch value of canonical, reduction and first-order recurrences
+    // phis in the vector loop.
+    VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+    for (VPRecipeBase &R : Header->phis()) {
+      // Skip phi-like recipes that generate their backedge values themselves.
+      if (isa<VPWidenPHIRecipe>(&R))
+        continue;
 
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (isa<VPWidenInductionRecipe>(&R)) {
+        PHINode *Phi = nullptr;
+        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+          Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+        } else {
+          auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+          assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+                 "recipe generating only scalars should have been replaced");
+          auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+          Phi = cast<PHINode>(GEP->getPointerOperand());
+        }
+
+        Phi->setIncomingBlock(1, VectorLatchBB);
+
+        // Move the last step to the end of the latch block. This ensures
+        // consistent placement of all induction updates.
+        Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+
+        // Use the steps for the last part as backedge value for the induction.
+        if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+          Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+        continue;
+      }
 
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
-      continue;
+      auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+      bool NeedsScalar = isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+                         (isa<VPReductionPHIRecipe>(PhiR) &&
+                          cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+      Value *Phi = State->get(PhiR, NeedsScalar);
+      Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
+      cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
     }
-
-    auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-    bool NeedsScalar = isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-                       (isa<VPReductionPHIRecipe>(PhiR) &&
-                        cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-    Value *Phi = State->get(PhiR, NeedsScalar);
-    Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
-    cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
   }
 
   State->CFG.DTU.flush();
@@ -1069,19 +1069,28 @@ InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   return getVectorLoopRegion()->cost(VF, Ctx);
 }
 
+VPBasicBlock *VPlan::getVectorPreheader() {
+  VPBlockBase *Current = getEntry()->getSuccessors().back();
+  while (Current->getNumSuccessors() == 2)
+    Current = Current->getSuccessors().back();
+  return cast<VPBasicBlock>(Current);
+}
+
+VPBasicBlock *VPlan::getVectorPreheader() const {
+  VPBlockBase *Current = getEntry()->getSuccessors().back();
+  while (Current->getNumSuccessors() == 2)
+    Current = Current->getSuccessors().back();
+  return cast<VPBasicBlock>(Current);
+}
+
 VPRegionBlock *VPlan::getVectorLoopRegion() {
   // TODO: Cache if possible.
-  for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
-    if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
-  return nullptr;
+  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
 }
 
 const VPRegionBlock *VPlan::getVectorLoopRegion() const {
-  for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
-    if (auto *R = dyn_cast<VPRegionBlock>(B))
-      return R;
-  return nullptr;
+  return dyn_cast<VPRegionBlock>(getVectorPreheader()->getSingleSuccessor());
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1427,8 +1436,10 @@ void VPlanIngredient::print(raw_ostream &O) const {
 #endif
 
 bool VPValue::isDefinedOutsideLoopRegions() const {
   return !hasDefiningRecipe() ||
-         !getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
+         (!getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
+          getDefiningRecipe()->getParent()->getPlan()->getVectorLoopRegion());
 }
 
 void VPValue::replaceAllUsesWith(VPValue *New) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 88f3f672d3aa38..c15018dc49c1c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3668,8 +3668,6 @@ class VPRegionBlock : public VPBlockBase {
   const VPBlockBase *getEntry() const { return Entry; }
   VPBlockBase *getEntry() { return Entry; }
 
-  /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
-  /// EntryBlock must have no predecessors.
   void setEntry(VPBlockBase *EntryBlock) {
     assert(EntryBlock->getPredecessors().empty() &&
            "Entry block cannot have predecessors.");
@@ -3845,9 +3843,8 @@ class VPlan {
   const VPBasicBlock *getEntry() const { return Entry; }
 
   /// Returns the preheader of the vector loop region.
-  VPBasicBlock *getVectorPreheader() {
-    return cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor());
-  }
+  VPBasicBlock *getVectorPreheader();
+  VPBasicBlock *getVectorPreheader() const;
 
   /// Returns the VPRegionBlock of the vector loop.
   VPRegionBlock *getVectorLoopRegion();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 77c08839dbfa95..8b424ed055feff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2376,7 +2376,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   // Replace the temporary unreachable terminator with a new conditional branch,
   // whose two destinations will be set later when they are created.
   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert(isa<UnreachableInst>(CurrentTerminator) &&
+  assert((isa<UnreachableInst>(CurrentTerminator) ||
+          (isa<BranchInst>(CurrentTerminator) &&
+           !CurrentTerminator->getOperand(0))) &&
          "Expected to replace unreachable terminator with conditional branch.");
   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
   CondBr->setSuccessor(0, nullptr);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 89aab71905a293..c59bd874487c87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -794,11 +794,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
 }
 
-/// Try to simplify the recipes in \p Plan
-static void simplifyRecipes(VPlan &Plan) {
+/// Try to simplify the recipes in \p Plan. If \p CanonicalIVTy is not nullptr,
+/// use it directly instead of retrieving the type from the plan's canonical
+/// IV, which may no longer exist.
+static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy = nullptr) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
-  Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+  Type *CanonicalIVType =
+      CanonicalIVTy ? CanonicalIVTy : Plan.getCanonicalIV()->getScalarType();
   VPTypeAnalysis TypeInfo(CanonicalIVType);
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
@@ -812,8 +815,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          PredicatedScalarEvolution &PSE) {
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
-  VPBasicBlock *ExitingVPBB =
-      Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
 
   // Try to simplify the branch condition if TC <= VF * UF when preparing to
   // execute the plan for the main vector loop.
We only do this if the @@ -837,16 +840,42 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); - - SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); - ExitingVPBB->appendRecipe(BOC); + auto *Header = cast(VectorRegion->getEntry()); + auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); + if (any_of(Header->phis(), + IsaPred)) { + LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); + } else { + for (VPRecipeBase &R : make_early_inc_range(Header->phis())) { + auto *P = cast(&R); + P->replaceAllUsesWith(P->getStartValue()); + P->eraseFromParent(); + } + + VPBlockBase *Preheader = Plan.getVectorPreheader(); + VPBlockBase *Middle = Plan.getMiddleBlock(); + VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); + VPBlockUtils::disconnectBlocks(VectorRegion, Middle); + + Header->setParent(nullptr); + ExitingVPBB->setParent(nullptr); + + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) { + if (isa(B)) + B->setParent(nullptr); + } + VPBlockUtils::connectBlocks(Preheader, Header); + VPBlockUtils::connectBlocks(ExitingVPBB, Middle); + simplifyRecipes(Plan, CanIVTy); + } + + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index 4f050877bd1316..e63155b024c43a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -80,15 +80,13 @@ define void @powi_call(ptr %P) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3) -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -102,7 +100,7 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label 
%[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 5c5600b9cfdf8e..10ac870c112aef 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -49,28 +49,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 3) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 3) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: 
middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -78,16 +76,16 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -118,28 +116,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 5) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = 
call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -147,16 +143,16 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -187,28 +183,26 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP2]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST1:%.*]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -216,16 +210,16 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST1]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]] ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -256,19 +250,17 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr 
[[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -285,7 +277,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -317,19 +309,17 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1) +; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -346,7 +336,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -390,7 +380,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -407,7 +397,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll index 375278eea38f97..3386a7d3972aa2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -14,16 +14,14 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4:%.*]], i32 0 +; CHECK-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0( [[TMP6]], ptr [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -31,13 +29,13 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -73,16 +71,14 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6:%.*]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 1) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: 
br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -90,13 +86,13 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index bfdcfbf5139b3d..7a6676ff6ea1b3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -167,16 +167,10 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 0, i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -194,7 +188,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -293,7 +287,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv2f64.nxv2p0( [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT6]], i32 8, [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -314,7 +308,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -354,8 +348,7 @@ attributes #1 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. 
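The RISC-V and AArch64 test updates above, and the SystemZ updates below, all exercise the same new behavior: once the planner proves the vector loop executes exactly once, `optimizeForVFAndUF` deletes the loop region instead of merely folding the latch branch to `br i1 true`, so the `[[INDEX]]` phi and `[[INDEX_NEXT]]` update vanish, address computations fold to a literal `0` offset, the body ends in an unconditional `br label %middle.block`, and the `!llvm.loop` metadata IDs renumber. Below is a minimal standalone sketch of the trip-count condition behind this, assuming a fixed-width VF; it is not LLVM code, the struct and function names are invented for illustration, and for scalable VFs the real check goes through SCEV with `vscale_range` facts rather than the known-minimum product used here.

```cpp
#include <cassert>

// Toy shape of one vectorized loop; all fields are plan-time constants here.
struct VectorLoopShape {
  unsigned TripCount; // trip count TC of the original scalar loop
  unsigned VF;        // vectorization factor (fixed-width)
  unsigned UF;        // unroll factor
};

// One vector iteration retires VF * UF scalar iterations, so TC <= VF * UF
// means the vector body runs exactly once and its enclosing region can be
// dissolved into straight-line code.
bool runsExactlyOnce(const VectorLoopShape &S) {
  return S.TripCount <= S.VF * S.UF;
}

int main() {
  assert(runsExactlyOnce({/*TripCount=*/16, /*VF=*/16, /*UF=*/1}));  // trip16_i8
  assert(runsExactlyOnce({/*TripCount=*/32, /*VF=*/32, /*UF=*/1}));  // trip32_i8
  assert(!runsExactlyOnce({/*TripCount=*/24, /*VF=*/8, /*UF=*/1}));  // trip24_i8 keeps its back edge
  return 0;
}
```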
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 98245fcc0f0af3..2de0f7e4d40169 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -19,125 +19,103 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT2]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT3]], -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], splat (i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: store i1 [[TMP9]], ptr [[P]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2 ; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3 ; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4 ; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 -; 
CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5 ; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6 ; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7 ; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] ; CHECK: pred.store.if15: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8 ; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.continue16: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; CHECK-NEXT: br i1 true, label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9 ; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10 ; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; CHECK-NEXT: br i1 false, label 
[[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; CHECK: pred.store.if21: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11 ; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.continue22: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12 ; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13 ; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14 ; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 false, label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] ; CHECK: pred.store.if29: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15 ; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.if31: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -156,7 +134,7 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: 
[[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10 -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -183,7 +161,6 @@ exit: ; preds = %for.body } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index fcf1ba072a62cb..61bcbaa1fe4d2e 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -16,30 +16,24 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[TMP1]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw i64 [[TMP2]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[TMP3]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 0, 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 1, 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 2, 4 +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 3, 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = 
insertelement <4 x i8> [[TMP18]], i8 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 @@ -64,14 +58,13 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[PRED_STORE_IF5]]: ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[DST]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -93,7 +86,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -125,7 +118,6 @@ exit: } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 61cae9c1b3f5dc..83e2f84814add5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -17,15 +17,11 @@ define void @f1() {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -40,7 +36,7 @@ define void @f1() {
; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index cc60359af2f8c5..7816c4918761f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -16,18 +16,16 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds 
float, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -43,7 +41,7 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll index 04ce9562c04b57..0f34f6243f155a 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll @@ -11,22 +11,20 @@ define i32 @foo(ptr %p) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG3:![0-9]+]] -; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG3]] -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG3]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]], !dbg [[DBG7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG8:![0-9]+]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG3]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG9:![0-9]+]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG3]] -; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG7]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG13:![0-9]+]] -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG14:![0-9]+]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], 
!dbg [[DBG9]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG7]] +; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG8]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 0 ; @@ -63,17 +61,16 @@ exit: ; preds = %loop ;. ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) ; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}}) -; CHECK: [[DBG3]] = !DILocation(line: 4, scope: [[META4:![0-9]+]]) +; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]]) ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) ; CHECK: [[META6]] = !{} -; CHECK: [[DBG7]] = !DILocation(line: 6, scope: [[META4]]) -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} -; CHECK: [[META9]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META10]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]]) -; CHECK: [[DBG12]] = !DILocation(line: 5, scope: [[META4]]) -; CHECK: [[DBG13]] = !DILocation(line: 7, scope: [[META4]]) -; CHECK: [[DBG14]] = !DILocation(line: 8, scope: [[META4]]) -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META9]]} +; CHECK: [[DBG7]] = !DILocation(line: 5, scope: [[META4]]) +; CHECK: [[DBG8]] = !DILocation(line: 9, scope: [[META4]]) +; CHECK: [[DBG9]] = !DILocation(line: 4, scope: [[META4]]) +; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]]) +; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]]) +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]} +; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 7f562a4f2c445d..509b8f99e4b61f 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -3707,13 +3707,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; UNROLL-NO-IC: vector.ph:
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
@@ -3726,7 +3721,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1
; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL-NO-IC: exit:
; UNROLL-NO-IC-NEXT: ret i32 0
;
@@ -3736,11 +3731,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; UNROLL-NO-VF: vector.ph:
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
-; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[TMP0]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; UNROLL-NO-VF: middle.block:
; UNROLL-NO-VF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
@@ -3753,7 +3745,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1
; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL-NO-VF: exit:
; UNROLL-NO-VF-NEXT: ret i32 0
;
@@ -3763,13 +3755,8 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; SINK-AFTER: vector.ph:
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
-; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
@@ -3782,7 +3769,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1
; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; SINK-AFTER: exit:
; SINK-AFTER-NEXT: ret i32 0
;
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index 7e6e5249381cda..3de42080b1842a 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -63,19 +63,18 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
-; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1
-; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP3]], align 1
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -92,7 +91,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; 
VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ -108,15 +107,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 0 ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10) -; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP2]], align 1 -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1 +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]] ; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -133,7 +130,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -168,82 +165,68 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 ; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF1: [[VECTOR_BODY]]: -; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ] -; VF8UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] -; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 -; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; VF8UF1-NEXT: [[VEC_IV:%.*]] = add <8 x i64> 
[[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
; VF8UF1-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
; VF8UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF8UF1: [[PRED_STORE_IF]]:
-; VF8UF1-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]]
+; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
; VF8UF1-NEXT: store i16 0, ptr [[TMP4]], align 2
; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE]]
; VF8UF1: [[PRED_STORE_CONTINUE]]:
; VF8UF1-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
-; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; VF8UF1: [[PRED_STORE_IF3]]:
-; VF8UF1-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]]
+; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1: [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
; VF8UF1-NEXT: store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1: [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1: [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; VF8UF1: [[PRED_STORE_CONTINUE4]]:
-; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
-; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; VF8UF1: [[PRED_STORE_IF5]]:
-; VF8UF1-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]]
-; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; VF8UF1: [[PRED_STORE_CONTINUE6]]:
-; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
-; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; VF8UF1: [[PRED_STORE_IF7]]:
-; VF8UF1-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3
-; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP24]]
-; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2
; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; VF8UF1: [[PRED_STORE_CONTINUE8]]:
-; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
-; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; 
VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; VF8UF1: [[PRED_STORE_IF9]]: -; VF8UF1-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2 +; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7 +; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; VF8UF1: [[PRED_STORE_CONTINUE10]]: -; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 +; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; VF8UF1: [[PRED_STORE_IF11]]: -; VF8UF1-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2 +; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8 +; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; VF8UF1: [[PRED_STORE_CONTINUE12]]: -; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 +; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; VF8UF1: [[PRED_STORE_IF13]]: -; VF8UF1-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2 +; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9 +; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF8UF1: [[PRED_STORE_CONTINUE14]]: -; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_IF15]]: -; VF8UF1-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]] -; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2 -; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]] -; VF8UF1: [[PRED_STORE_CONTINUE16]]: -; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF1: [[SCALAR_PH]]: @@ -255,7 +238,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF1: [[EXIT]]: ; VF8UF1-NEXT: ret void ; @@ -270,148 +253,125 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; 
VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF8UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
-; VF8UF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
-; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; VF8UF2-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; VF8UF2-NEXT: [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT1]]
+; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
; VF8UF2-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF8UF2: [[PRED_STORE_IF]]:
-; VF8UF2-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 2
; VF8UF2-NEXT: store i16 0, ptr [[TMP5]], align 2
; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE]]
; VF8UF2: [[PRED_STORE_CONTINUE]]:
; VF8UF2-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
-; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; VF8UF2: [[PRED_STORE_IF6]]:
-; VF8UF2-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2: [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 3
; VF8UF2-NEXT: store i16 0, ptr [[TMP7]], align 2
-; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE7]]
-; VF8UF2: [[PRED_STORE_CONTINUE7]]:
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2: [[PRED_STORE_CONTINUE2]]:
; VF8UF2-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
-; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
-; VF8UF2: [[PRED_STORE_IF8]]:
-; VF8UF2-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2: [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 4
; VF8UF2-NEXT: store i16 0, ptr [[TMP9]], align 2
-; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE9]]
-; VF8UF2: [[PRED_STORE_CONTINUE9]]:
+; VF8UF2-NEXT: br label 
%[[PRED_STORE_CONTINUE4]] +; VF8UF2: [[PRED_STORE_CONTINUE4]]: ; VF8UF2-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] -; VF8UF2: [[PRED_STORE_IF10]]: -; VF8UF2-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3 -; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]] +; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; VF8UF2: [[PRED_STORE_IF5]]: +; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 5 ; VF8UF2-NEXT: store i16 0, ptr [[TMP11]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE11]] -; VF8UF2: [[PRED_STORE_CONTINUE11]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; VF8UF2: [[PRED_STORE_CONTINUE6]]: ; VF8UF2-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4 -; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] -; VF8UF2: [[PRED_STORE_IF12]]: -; VF8UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4 -; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]] +; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF8UF2: [[PRED_STORE_IF7]]: +; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 6 ; VF8UF2-NEXT: store i16 0, ptr [[TMP13]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE13]] -; VF8UF2: [[PRED_STORE_CONTINUE13]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF8UF2: [[PRED_STORE_CONTINUE8]]: ; VF8UF2-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] -; VF8UF2: [[PRED_STORE_IF14]]: -; VF8UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5 -; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]] +; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF8UF2: [[PRED_STORE_IF9]]: +; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 7 ; VF8UF2-NEXT: store i16 0, ptr [[TMP15]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE15]] -; VF8UF2: [[PRED_STORE_CONTINUE15]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF8UF2: [[PRED_STORE_CONTINUE10]]: ; VF8UF2-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]] -; VF8UF2: [[PRED_STORE_IF16]]: -; VF8UF2-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6 -; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]] +; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; VF8UF2: [[PRED_STORE_IF11]]: +; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 8 ; VF8UF2-NEXT: store i16 0, ptr [[TMP17]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE17]] -; VF8UF2: [[PRED_STORE_CONTINUE17]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF8UF2: [[PRED_STORE_CONTINUE12]]: ; VF8UF2-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] -; VF8UF2: [[PRED_STORE_IF18]]: -; VF8UF2-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7 -; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]] +; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF13:.*]], label 
%[[PRED_STORE_CONTINUE14:.*]] +; VF8UF2: [[PRED_STORE_IF13]]: +; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 9 ; VF8UF2-NEXT: store i16 0, ptr [[TMP19]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE19]] -; VF8UF2: [[PRED_STORE_CONTINUE19]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; VF8UF2: [[PRED_STORE_CONTINUE14]]: ; VF8UF2-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 -; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]] -; VF8UF2: [[PRED_STORE_IF20]]: -; VF8UF2-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8 -; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]] +; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; VF8UF2: [[PRED_STORE_IF15]]: +; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 10 ; VF8UF2-NEXT: store i16 0, ptr [[TMP21]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE21]] -; VF8UF2: [[PRED_STORE_CONTINUE21]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; VF8UF2: [[PRED_STORE_CONTINUE16]]: ; VF8UF2-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 -; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]] -; VF8UF2: [[PRED_STORE_IF22]]: -; VF8UF2-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9 -; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]] +; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; VF8UF2: [[PRED_STORE_IF17]]: +; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 11 ; VF8UF2-NEXT: store i16 0, ptr [[TMP23]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE23]] -; VF8UF2: [[PRED_STORE_CONTINUE23]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; VF8UF2: [[PRED_STORE_CONTINUE18]]: ; VF8UF2-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 -; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]] -; VF8UF2: [[PRED_STORE_IF24]]: -; VF8UF2-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10 -; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]] +; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; VF8UF2: [[PRED_STORE_IF19]]: +; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 12 ; VF8UF2-NEXT: store i16 0, ptr [[TMP25]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE25]] -; VF8UF2: [[PRED_STORE_CONTINUE25]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; VF8UF2: [[PRED_STORE_CONTINUE20]]: ; VF8UF2-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 -; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]] -; VF8UF2: [[PRED_STORE_IF26]]: -; VF8UF2-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11 -; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]] +; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; VF8UF2: [[PRED_STORE_IF21]]: +; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 13 ; VF8UF2-NEXT: store i16 0, ptr [[TMP27]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE27]] -; VF8UF2: [[PRED_STORE_CONTINUE27]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; VF8UF2: [[PRED_STORE_CONTINUE22]]: ; VF8UF2-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 -; VF8UF2-NEXT: br i1 
[[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] -; VF8UF2: [[PRED_STORE_IF28]]: -; VF8UF2-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12 -; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]] +; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; VF8UF2: [[PRED_STORE_IF23]]: +; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 14 ; VF8UF2-NEXT: store i16 0, ptr [[TMP29]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE29]] -; VF8UF2: [[PRED_STORE_CONTINUE29]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; VF8UF2: [[PRED_STORE_CONTINUE24]]: ; VF8UF2-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 -; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] -; VF8UF2: [[PRED_STORE_IF30]]: -; VF8UF2-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13 -; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]] +; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; VF8UF2: [[PRED_STORE_IF25]]: +; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 15 ; VF8UF2-NEXT: store i16 0, ptr [[TMP31]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE31]] -; VF8UF2: [[PRED_STORE_CONTINUE31]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; VF8UF2: [[PRED_STORE_CONTINUE26]]: ; VF8UF2-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 -; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] -; VF8UF2: [[PRED_STORE_IF32]]: -; VF8UF2-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14 -; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]] +; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; VF8UF2: [[PRED_STORE_IF27]]: +; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 16 ; VF8UF2-NEXT: store i16 0, ptr [[TMP33]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE33]] -; VF8UF2: [[PRED_STORE_CONTINUE33]]: +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; VF8UF2: [[PRED_STORE_CONTINUE28]]: ; VF8UF2-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 -; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_IF34]]: -; VF8UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15 -; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]] +; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; VF8UF2: [[PRED_STORE_IF29]]: +; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 17 ; VF8UF2-NEXT: store i16 0, ptr [[TMP35]], align 2 -; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; VF8UF2: [[PRED_STORE_CONTINUE35]]: -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; VF8UF2: [[PRED_STORE_CONTINUE30]]: +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[SCALAR_PH]]: @@ -423,7 +383,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; VF8UF2-NEXT: 
[[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: ret void
;
@@ -438,146 +398,124 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]]
-; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; VF16UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ]
-; VF16UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
-; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
-; VF16UF1-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
-; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT1]]
; VF16UF1-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
; VF16UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; VF16UF1: [[PRED_STORE_IF]]:
-; VF16UF1-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
-; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
+; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 2
; VF16UF1-NEXT: store i16 0, ptr [[TMP4]], align 2
; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE]]
; VF16UF1: [[PRED_STORE_CONTINUE]]:
; VF16UF1-NEXT: [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
-; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; VF16UF1: [[PRED_STORE_IF3]]:
-; VF16UF1-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1
-; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1: [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 3
; VF16UF1-NEXT: store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1: [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1: [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 4
+; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; VF16UF1: [[PRED_STORE_CONTINUE4]]:
-; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
-; VF16UF1-NEXT: br i1 [[TMP7]], label 
%[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
 ; VF16UF1: [[PRED_STORE_IF5]]:
-; VF16UF1-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2
-; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 5
+; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]]
 ; VF16UF1: [[PRED_STORE_CONTINUE6]]:
-; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
-; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
 ; VF16UF1: [[PRED_STORE_IF7]]:
-; VF16UF1-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3
-; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 6
+; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]]
 ; VF16UF1: [[PRED_STORE_CONTINUE8]]:
-; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
-; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
 ; VF16UF1: [[PRED_STORE_IF9]]:
-; VF16UF1-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 7
+; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]]
 ; VF16UF1: [[PRED_STORE_CONTINUE10]]:
-; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
-; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
 ; VF16UF1: [[PRED_STORE_IF11]]:
-; VF16UF1-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5
-; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 8
+; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]]
 ; VF16UF1: [[PRED_STORE_CONTINUE12]]:
-; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
-; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
 ; VF16UF1: [[PRED_STORE_IF13]]:
-; VF16UF1-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6
-; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 9
+; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]]
 ; VF16UF1: [[PRED_STORE_CONTINUE14]]:
-; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
-; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
 ; VF16UF1: [[PRED_STORE_IF15]]:
-; VF16UF1-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7
-; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 10
+; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]]
 ; VF16UF1: [[PRED_STORE_CONTINUE16]]:
-; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
-; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
 ; VF16UF1: [[PRED_STORE_IF17]]:
-; VF16UF1-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8
-; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 11
+; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE18]]
 ; VF16UF1: [[PRED_STORE_CONTINUE18]]:
-; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
-; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
 ; VF16UF1: [[PRED_STORE_IF19]]:
-; VF16UF1-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9
-; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 12
+; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE20]]
 ; VF16UF1: [[PRED_STORE_CONTINUE20]]:
-; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
-; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
 ; VF16UF1: [[PRED_STORE_IF21]]:
-; VF16UF1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10
-; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 13
+; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE22]]
 ; VF16UF1: [[PRED_STORE_CONTINUE22]]:
-; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
-; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
 ; VF16UF1: [[PRED_STORE_IF23]]:
-; VF16UF1-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11
-; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 14
+; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE24]]
 ; VF16UF1: [[PRED_STORE_CONTINUE24]]:
-; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
-; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
 ; VF16UF1: [[PRED_STORE_IF25]]:
-; VF16UF1-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12
-; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 15
+; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE26]]
 ; VF16UF1: [[PRED_STORE_CONTINUE26]]:
-; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
-; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
 ; VF16UF1: [[PRED_STORE_IF27]]:
-; VF16UF1-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13
-; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 16
+; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE28]]
 ; VF16UF1: [[PRED_STORE_CONTINUE28]]:
-; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
-; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
 ; VF16UF1: [[PRED_STORE_IF29]]:
-; VF16UF1-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14
-; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 17
+; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2
 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE30]]
 ; VF16UF1: [[PRED_STORE_CONTINUE30]]:
-; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
-; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]]
-; VF16UF1: [[PRED_STORE_IF31]]:
-; VF16UF1-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15
-; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
-; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2
-; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE32]]
-; VF16UF1: [[PRED_STORE_CONTINUE32]]:
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1: [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; VF16UF1: [[SCALAR_PH]]:
@@ -589,7 +527,7 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5,
 ; VF16UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2
 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; VF16UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF16UF1: [[EXIT]]:
 ; VF16UF1-NEXT: ret void
 ;
@@ -633,7 +571,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
 ; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VF8UF1: [[MIDDLE_BLOCK]]:
 ; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -648,7 +586,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF8UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF8UF1: [[OUTER_LATCH]]:
 ; VF8UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF8UF1-NEXT: [[C_2:%.*]] = call i1 @cond()
@@ -669,20 +607,17 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
 ; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 0
 ; VF8UF2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
 ; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
 ; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF8UF2: [[MIDDLE_BLOCK]]:
 ; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -697,7 +632,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF8UF2-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF8UF2-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF8UF2: [[OUTER_LATCH]]:
 ; VF8UF2-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF8UF2-NEXT: [[C_2:%.*]] = call i1 @cond()
@@ -718,16 +653,13 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
 ; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]]
+; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 0
 ; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 0
 ; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
 ; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; VF16UF1: [[MIDDLE_BLOCK]]:
 ; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
@@ -742,7 +674,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
 ; VF16UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
 ; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
 ; VF16UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF16UF1: [[OUTER_LATCH]]:
 ; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
 ; VF16UF1-NEXT: [[C_2:%.*]] = call i1 @cond()
@@ -780,28 +712,19 @@ exit:
 ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
 ;.
 ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.
 ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 930d3cd41d31d9..791c995d88c146 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -499,16 +499,13 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[G_64]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 0, [[G_64]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 0, [[TMP8]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 -3
 ; CHECK-NEXT: store <4 x i16> splat (i16 -1), ptr [[TMP7]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -521,7 +518,7 @@ define void @sext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
 ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 16
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -558,6 +555,5 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index d70c874499cb74..beb305f23884e6 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -61,7 +61,6 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ;
 ; CHECK: Executing best plan with VF=8, UF=2
 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
-; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF
 ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count
 ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
 ; CHECK-EMPTY:
@@ -75,27 +74,21 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT: IR %n.vec = sub i64 %and, %n.mod.vf
 ; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%and> + ir<[[VTC]]> * ir<-1>
 ; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%A> + ir<[[VTC]]> * ir<1>
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:
-; CHECK-NEXT: vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
-; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
-; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]>
-; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
-; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
-; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
-; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
-; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add>
-; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, ir<[[VFxUF]]>
-; CHECK-NEXT: EMIT branch-on-cond ir
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: vp<[[STEPS1:%.+]]> = SCALAR-STEPS ir<0>, ir<1>
+; CHECK-NEXT: EMIT vp<[[PADD1:%.+]]> = ptradd ir<%A>, vp<[[STEPS1]]>
+; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
+; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
+; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer vp<[[PADD1]]>
+; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
+; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb: