diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 75ff0fe259a3b0..9fc42e52a1e3e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3016,6 +3016,22 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { PSE.getSE()->forgetLoop(OrigLoop); PSE.getSE()->forgetBlockAndLoopDispositions(); + // When dealing with uncountable early exits we create middle.split blocks + // between the vector loop region and the exit block. These blocks need + // adding to any outer loop. + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); + Loop *OuterLoop = OrigLoop->getParentLoop(); + if (Legal->hasUncountableEarlyExit() && OuterLoop) { + VPBasicBlock *MiddleVPBB = State.Plan->getMiddleBlock(); + VPBlockBase *PredVPBB = MiddleVPBB->getSinglePredecessor(); + while (PredVPBB && PredVPBB != VectorRegion) { + BasicBlock *MiddleSplitBB = + State.CFG.VPBB2IRBB[cast(PredVPBB)]; + OuterLoop->addBasicBlockToLoop(MiddleSplitBB, *LI); + PredVPBB = PredVPBB->getSinglePredecessor(); + } + } + // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE // looked through single-entry phis. @@ -3046,7 +3062,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; @@ -4123,7 +4138,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. 
(TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + if (Legal->hasUncountableEarlyExit() || + TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { @@ -4753,7 +4769,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) + // TODO: Add support for loops with an early exit. + if (Legal->hasUncountableEarlyExit() || + OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -5001,6 +5019,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; + // We don't attempt to perform interleaving for early exit loops. + if (Legal->hasUncountableEarlyExit()) + return 1; + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); @@ -7813,11 +7835,14 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 When vectorizing the epilogue, fix reduction and induction resume // values from the additional bypass block. 
if (VectorizingEpilogue) { + assert(!ILV.Legal->hasUncountableEarlyExit() && + "Epilogue vectorisation not yet supported with early exits"); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); for (VPRecipeBase &R : *ExitVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( &R, State, State.CFG.VPBB2IRBB[ExitVPBB], BypassBlock); } + BasicBlock *PH = OrigLoop->getLoopPreheader(); for (const auto &[IVPhi, _] : Legal->getInductionVars()) { auto *Inc = cast(IVPhi->getIncomingValueForBlock(PH)); @@ -10177,13 +10202,33 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { - reportVectorizationFailure("Auto-vectorization of loops with uncountable " - "early exit is not enabled", - "Auto-vectorization of loops with uncountable " - "early exit is not enabled", - "UncountableEarlyExitLoopsDisabled", ORE, L); - return false; + if (LVL.hasUncountableEarlyExit()) { + if (!EnableEarlyExitVectorization) { + reportVectorizationFailure("Auto-vectorization of loops with uncountable " + "early exit is not enabled", + "Auto-vectorization of loops with uncountable " + "early exit is not enabled", + "UncountableEarlyExitLoopsDisabled", ORE, L); + return false; + } + + // Needed to prevent InnerLoopVectorizer::fixupIVUsers from crashing. + for (BasicBlock *BB : L->blocks()) { + for (Instruction &I : *BB) { + for (User *U : I.users()) { + Instruction *UI = cast(U); + if (!L->contains(UI)) { + reportVectorizationFailure( + "Auto-vectorization of loops with uncountable " + "early exit and live-outs is not yet supported", + "Auto-vectorization of loop with uncountable " + "early exit and live-outs is not yet supported", + "UncountableEarlyExitLoopLiveOutsUnsupported", ORE, L); + return false; + } + } + } + } } // Entrance to the VPlan-native vectorization path. 
Outer loops are processed @@ -10208,6 +10253,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (UseInterleaved) IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + if (LVL.hasUncountableEarlyExit()) { + BasicBlock *LoopLatch = L->getLoopLatch(); + if (IAI.requiresScalarEpilogue() || + llvm::any_of(LVL.getCountableExitingBlocks(), + [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { + reportVectorizationFailure("Auto-vectorization of early exit loops " + "requiring a scalar epilogue is unsupported", + "Auto-vectorization of early exit loops " + "requiring a scalar epilogue is unsupported", + "UncountableEarlyExitUnsupported", ORE, L); + return false; + } + } + // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 82556bdd2a5ec1..339875c3b6b7f6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s --check-prefixes=CHECK +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s --check-prefixes=CHECK target triple = "aarch64-unknown-linux-gnu" @@ -272,22 +272,66 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) ; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[END]] to i10 +; CHECK-NEXT: [[TMP20:%.*]] = zext i10 [[TMP19]] to i64 +; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP20]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[UMAX1]], 12 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 1, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], 
[[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP16]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND]], label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 ; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT:%.*]] +; CHECK-NEXT: 
br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: found: ; CHECK-NEXT: ret i32 1 ; CHECK: exit: @@ -325,9 +369,39 @@ exit: ret i32 0 } +%my.struct = type { i8, i8 } + +define i64 @same_exit_block_requires_interleaving() { +entry: + %p1 = alloca [128 x %my.struct] + call void @init_mem(ptr %p1, i64 256) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.latch ], [ 3, %entry ] + %arrayidx = getelementptr inbounds [128 x %my.struct], ptr %p1, i64 0, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %cmp3 = icmp eq i8 %ld1, 3 + br i1 %cmp3, label %loop.latch, label %loop.end + +loop.latch: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 69 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop.latch ], [ 1, %loop ] + ret i64 %retval +} declare i32 @foo(i32) readonly declare @foo_vec() attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } attributes #1 = { "target-features"="+sve" vscale_range(1,16) } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll index 2a99693523d3cf..64b5df1123c5bf 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! 
-; CHECK-NOT: LV: Not vectorizing +; CHECK-NOT: LV: Auto-vectorization of loops with uncountable early exit and live-outs is not yet supported. entry: %p1 = alloca [1024 x i8] %p2 = alloca [1024 x i8] @@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. +; CHECK: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit and live-outs is not yet supported. entry: %p1 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll index 94af5b7c7607da..0e753a535cd2d3 100644 --- a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s declare void @init_mem(ptr, i64); diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll index 7759c10032e9bd..4027f6a0f5dfdc 100644 --- a/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s declare void @init_mem(ptr, i64); diff --git 
a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 7889191c4b5bae..7f00e77b9169dd 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s declare void @init_mem(ptr, i64); diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll index c68eeac19c9ecf..0808933b9f17f0 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll @@ -1,32 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization \ +; RUN: | FileCheck %s --check-prefix=MAY_FAULT declare void @init_mem(ptr, i64); define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 +; MAY_FAULT-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; MAY_FAULT-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: %p1 = 
alloca [42 x i8] @@ -56,25 +57,25 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; MAY_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; 
MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: br label %loop @@ -100,25 +101,25 @@ loop.end: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( -; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] +; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( +; MAY_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; MAY_FAULT-NEXT: entry: +; MAY_FAULT-NEXT: br label [[LOOP:%.*]] +; MAY_FAULT: loop: +; MAY_FAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 
[[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; MAY_FAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; MAY_FAULT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; MAY_FAULT-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; MAY_FAULT-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; MAY_FAULT-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; MAY_FAULT: loop.inc: +; MAY_FAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; MAY_FAULT-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; MAY_FAULT-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; MAY_FAULT: loop.end: +; MAY_FAULT-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; MAY_FAULT-NEXT: ret i64 [[RETVAL]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll new file mode 100644 index 00000000000000..0810639e84719f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll @@ -0,0 +1,87 @@ +; RUN: opt -S < %s -p loop-vectorize,'print<loops>' -disable-output -enable-early-exit-vectorization 2>&1 | FileCheck %s + +declare void @init_mem(ptr, i64); + + +define void @early_exit_in_outer_loop1() { +; CHECK-LABEL: Loop info for function 'early_exit_in_outer_loop1': +; CHECK: Loop at depth 1 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop.outer + +loop.outer: + %count = phi i64 [ 0, %entry ], [ %count.next, %loop.inner.end ] + br label %loop.inner + +loop.inner: + %index = phi i64 [ %index.next, %loop.inner.inc ], [ 3, %loop.outer ] + %arrayidx = 
getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inner.inc, label %loop.inner.found + +loop.inner.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop.inner, label %loop.inner.end + +loop.inner.found: + br label %loop.inner.end + +loop.inner.end: + %count.next = phi i64 [ 0, %loop.inner.inc ], [ 1, %loop.inner.found ] + br label %loop.outer +} + +define void @early_exit_in_outer_loop2() { +; CHECK-LABEL: Loop info for function 'early_exit_in_outer_loop2': +; CHECK: Loop at depth 1 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split +; CHECK: Loop at depth 2 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop.outer + +loop.outer: + %count.outer = phi i64 [ 0, %entry ], [ %count.outer.next , %loop.outer.latch ] + br label %loop.middle + +loop.middle: + br label %loop.inner + +loop.inner: + %index = phi i64 [ %index.next, %loop.inner.inc ], [ 3, %loop.middle ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inner.inc, label %loop.inner.found + +loop.inner.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop.inner, label %loop.inner.end + +loop.inner.end: + br i1 false, label %loop.middle, label %loop.middle.end + +loop.middle.end: + br label %loop.outer.latch + +loop.inner.found: + br label %loop.outer.latch + 
+loop.outer.latch: + %t = phi i64 [ 0, %loop.middle.end ], [ 1, %loop.inner.found ] + %count.outer.next = add i64 %count.outer, %t + br label %loop.outer +}