-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Delete the redundant overflow check for DataAndControlFlowWithoutRuntimeCheck #79825
base: main
Are you sure you want to change the base?
Conversation
…houtRuntimeCheck Fix llvm#71917
@llvm/pr-subscribers-llvm-transforms Author: Allen (vfdff) ChangesFix #71917 Patch is 454.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79825.diff 12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5d7ac2bf9bb05f1..7833bc06e04c9c4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1107,21 +1107,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Create the ActiveLaneMask instruction using the correct start values.
VPValue *TC = Plan.getTripCount();
- VPValue *TripCount, *IncrementValue;
- if (!DataAndControlFlowWithoutRuntimeCheck) {
- // When the loop is guarded by a runtime overflow check for the loop
- // induction variable increment by VF, we can increment the value before
- // the get.active.lane mask and use the unmodified tripcount.
- IncrementValue = CanonicalIVIncrement;
- TripCount = TC;
- } else {
- // When avoiding a runtime check, the active.lane.mask inside the loop
- // uses a modified trip count and the induction variable increment is
- // done after the active.lane.mask intrinsic is called.
- IncrementValue = CanonicalIVPHI;
- TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
- {TC}, DL);
- }
auto *EntryIncrement = Builder.createOverflowingOp(
VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
"index.part.next");
@@ -1140,12 +1125,18 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// original terminator.
VPRecipeBase *OriginalTerminator = EB->getTerminator();
Builder.setInsertPoint(OriginalTerminator);
- auto *InLoopIncrement =
- Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
- {IncrementValue}, {false, false}, DL);
- auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
- {InLoopIncrement, TripCount}, DL,
- "active.lane.mask.next");
+
+ auto *InLoopIncrement = CanonicalIVIncrement;
+ // When the loop is guarded by a runtime overflow check for the loop induction
+ // variable increment by VF, we can increment the value before the
+ // get.active.lane mask and use the unmodified tripcount.
+ if (!DataAndControlFlowWithoutRuntimeCheck)
+ InLoopIncrement =
+ Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+ {InLoopIncrement}, {false, false}, DL);
+ auto *ALM =
+ Builder.createNaryOp(VPInstruction::ActiveLaneMask, {InLoopIncrement, TC},
+ DL, "active.lane.mask.next");
LaneMaskPhi->addOperand(ALM);
// Replace the original terminator with BranchOnCond. We have to invert the
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
index 7bb25b8b3ea304a..b551423ebee12f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -4,9 +4,6 @@
define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n) {
; CHECK-LABEL: @invariant_store_red_exit_is_phi(
; CHECK: vector.ph:
-; CHECK: %[[N_MINUS_VF:.*]] = sub i64 %n, %[[VSCALE_X_4:.*]]
-; CHECK: %[[CMP:.*]] = icmp ugt i64 %n, %[[VSCALE_X_4]]
-; CHECK: %[[N2:.*]] = select i1 %[[CMP]], i64 %[[N_MINUS_VF]], i64 0
; CHECK: %[[ACTIVE_LANE_MASK_ENTRY:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
; CHECK: vector.body:
; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> [ %[[ACTIVE_LANE_MASK_ENTRY]], %vector.ph ], [ %[[ACTIVE_LANE_MASK_NEXT:.*]], %vector.body ]
@@ -14,7 +11,8 @@ define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n
; CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0
; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
-; CHECK: %[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %[[N2]])
+; CHECK-NEXT: %[[INDEX_NEXT:.*]] = add i64 %index, %{{.*}}
+; CHECK: %[[ACTIVE_LANE_MASK_NEXT:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %[[INDEX_NEXT]], i64 %n)
; CHECK: middle.block:
; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
; CHECK-NEXT: store i32 %[[SUM]], ptr %dst, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index fc67fb5aded6a8d..d14f4794b836570 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -42,22 +42,22 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP7]])
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP9]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
@@ -89,18 +89,18 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
-; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED: middle.block:
@@ -108,7 +108,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -120,7 +120,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
;
; CHECK-ORDERED-TF-LABEL: define float @fadd_strict
@@ -136,47 +136,42 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
-; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP13]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement <vscale x 8 x i1> [[TMP17]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP9]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP10]])
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement <vscale x 8 x i1> [[TMP12]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED-TF: scalar.ph:
; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED-TF: for.body:
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP19]], [[SUM_07]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP14]], [[SUM_07]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]]
;
@@ -230,60 +225,60 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32
+; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x...
[truncated]
|
ping ? |
ping ? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a number of test changes that seem to only adjust the naming in the patterns, can those be undone?
Could you also expand the description a bit, currently it is not clear to me why this change is legal in all cases (see inline comment). Would also be good to explain without need to refer SVE specific instructions.
"active.lane.mask.next"); | ||
|
||
auto *InLoopIncrement = CanonicalIVIncrement; | ||
// When the loop is guarded by a runtime overflow check for the loop induction |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I might be missing something, but if DataAndControlFlowWithoutRuntimeCheck
there is no runtime check to make sure iv + UFxVF
doesn't overflow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hi @fhahn
Do you mean use -mllvm -force-tail-folding-style=data-and-control-without-rt-check to enable DataAndControlFlowWithoutRuntimeCheck ? I don't find any change with the option, see https://gcc.godbolt.org/z/M878YbK8d
Now we bring in the runtime check of overflow to make sure num-vf is not a negative number. so if we place the update of iv before the while i< num-vf as gcc, then the loop bound num doesn't need subtract VF in advance.
|
Don't need to subtract VF in advance if you put the iv update before whilelo.
Fix #78960