Skip to content

Commit

Permalink
[CostModel][X86] Improve add/sub/mul overflow intrinsic costs
Browse files Browse the repository at this point in the history
Summary: Noticed due to x86 changes in #97463

Test Plan: 

Reviewers: 

Subscribers: 

Tasks: 

Tags: 


Differential Revision: https://phabricator.intern.facebook.com/D60250555
  • Loading branch information
RKSimon authored and yuxuanchen1997 committed Jul 25, 2024
1 parent b9815c5 commit b7e5db1
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 253 deletions.
34 changes: 20 additions & 14 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4167,9 +4167,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
{ ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
{ ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
- { ISD::SADDO, MVT::i64, { 1 } },
- { ISD::UADDO, MVT::i64, { 1 } },
- { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
+ { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
+ { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
+ { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
+ { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
};
static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
Expand Down Expand Up @@ -4231,15 +4232,18 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
{ ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
{ ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
- { ISD::SADDO, MVT::i32, { 1 } },
- { ISD::SADDO, MVT::i16, { 1 } },
- { ISD::SADDO, MVT::i8, { 1 } },
- { ISD::UADDO, MVT::i32, { 1 } },
- { ISD::UADDO, MVT::i16, { 1 } },
- { ISD::UADDO, MVT::i8, { 1 } },
- { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
- { ISD::UMULO, MVT::i16, { 2 } },
- { ISD::UMULO, MVT::i8, { 2 } },
+ { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
+ { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
+ { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
+ { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
+ { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
+ { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
+ { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
+ { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
+ { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
+ { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
+ { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
+ { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
};

Type *RetTy = ICA.getReturnType();
Expand Down Expand Up @@ -4352,9 +4356,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
ISD = ISD::UADDO;
OpTy = RetTy->getContainedType(0);
break;
- case Intrinsic::umul_with_overflow:
  case Intrinsic::smul_with_overflow:
- // SMULO has same costs so don't duplicate.
+ ISD = ISD::SMULO;
+ OpTy = RetTy->getContainedType(0);
+ break;
+ case Intrinsic::umul_with_overflow:
ISD = ISD::UMULO;
OpTy = RetTy->getContainedType(0);
break;
Expand Down
460 changes: 230 additions & 230 deletions llvm/test/Analysis/CostModel/X86/arith-overflow.ll

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/X86/costmodel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ define i64 @foo(i64 %arg) {
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %I2P = inttoptr i64 undef to ptr
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %P2I = ptrtoint ptr undef to i64
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %TC = trunc i64 undef to i32
- ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+ ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void undef()
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 undef
;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,22 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)

define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
; THRU-LABEL: 'umul'
- ; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ ; THRU-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
; THRU-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'umul'
- ; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ ; LATE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
; LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'umul'
- ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
; SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'umul'
- ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG1]], 1
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 30
+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 54
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG2]], i64 8
Expand Down Expand Up @@ -379,7 +379,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N]], 3
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 28
+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 52
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 3
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ define i32 @main(ptr %ptr) {
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], [[UMIN1]]
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 32
+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP4]], 40
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[CONV3]], -1
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo
; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1
; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]]
; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 28
+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 60
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
Expand Down

0 comments on commit b7e5db1

Please sign in to comment.