Skip to content

Commit

Permalink
[SDAG] Don't treat ISD::SHL as a uniform binary operator in `ShrinkDe…
Browse files Browse the repository at this point in the history
…mandedOp` (#92753)

In `TargetLowering::ShrinkDemandedOp`, types of lhs and rhs may differ
before legalization.
In the original case, `VT` is `i64` and `SmallVT` is `i32`, but the type
of rhs is `i8`. Then invalid truncate nodes will be created.

See the description of ISD::SHL for further information:
> After legalization, the type of the shift amount is known to be
TLI.getShiftAmountTy(). Before legalization, the shift amount can be any
type, but care must be taken to ensure it is large enough.


https://github.com/llvm/llvm-project/blob/605ae4e93be8976095c7eedf5c08bfdb9ff71257/llvm/include/llvm/CodeGen/ISDOpcodes.h#L691-L712

This patch stops handling ISD::SHL in `TargetLowering::ShrinkDemandedOp`
and duplicates the logic in `TargetLowering::SimplifyDemandedBits`.
Additionally, it adds some additional checks like
`isNarrowingProfitable` and `isTypeDesirableForOp` to improve the
codegen on AArch64.

Fixes #92720.
  • Loading branch information
dtcxzyw authored May 22, 2024
1 parent ba0e871 commit cf12830
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 71 deletions.
32 changes: 29 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (VT.isVector())
return false;

assert(Op.getOperand(0).getValueType().getScalarSizeInBits() == BitWidth &&
Op.getOperand(1).getValueType().getScalarSizeInBits() == BitWidth &&
"ShrinkDemandedOp only supports operands that have the same size!");

// Don't do this if the node has another user, which may require the
// full value.
if (!Op.getNode()->hasOneUse())
Expand Down Expand Up @@ -1832,11 +1836,33 @@ bool TargetLowering::SimplifyDemandedBits(
}
}

// TODO: Can we merge this fold with the one below?
// Try shrinking the operation as long as the shift amount will still be
// in range.
if ((ShAmt < DemandedBits.getActiveBits()) &&
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&
Op.getNode()->hasOneUse()) {
// Search for the smallest integer type with free casts to and from
// Op's type. For expedience, just check power-of-2 integer types.
unsigned DemandedSize = DemandedBits.getActiveBits();
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
if (isNarrowingProfitable(VT, SmallVT) &&
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
assert(DemandedSize <= SmallVTBits &&
"Narrowed below demanded bits?");
// We found a type with free casts.
SDValue NarrowShl = TLO.DAG.getNode(
ISD::SHL, dl, SmallVT,
TLO.DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
TLO.DAG.getShiftAmountConstant(ShAmt, SmallVT, dl));
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
}
}
}

// Narrow shift to lower half - similar to ShrinkDemandedOp.
// (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
Expand Down
11 changes: 4 additions & 7 deletions llvm/test/CodeGen/AArch64/bitfield-insert.ll
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,10 @@ define void @test_64bit_badmask(ptr %existing, ptr %new) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: mov w10, #135 // =0x87
; CHECK-NEXT: mov w11, #664 // =0x298
; CHECK-NEXT: lsl w9, w9, #3
; CHECK-NEXT: and x8, x8, x10
; CHECK-NEXT: and x9, x9, x11
; CHECK-NEXT: mov w10, #664 // =0x298
; CHECK-NEXT: mov w11, #135 // =0x87
; CHECK-NEXT: and x9, x10, x9, lsl #3
; CHECK-NEXT: and x8, x8, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
Expand Down Expand Up @@ -579,7 +578,6 @@ define <2 x i32> @test_complex_type(ptr %addr, i64 %in, ptr %bf ) {
define i64 @test_truncated_shift(i64 %x, i64 %y) {
; CHECK-LABEL: test_truncated_shift:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w1 killed $w1 killed $x1 def $x1
; CHECK-NEXT: bfi x0, x1, #25, #5
; CHECK-NEXT: ret
entry:
Expand All @@ -593,7 +591,6 @@ entry:
define i64 @test_and_extended_shift_with_imm(i64 %0) {
; CHECK-LABEL: test_and_extended_shift_with_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-NEXT: ubfiz x0, x0, #7, #8
; CHECK-NEXT: ret
%2 = shl i64 %0, 7
Expand Down
118 changes: 57 additions & 61 deletions llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -571,29 +571,27 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp x10, x9, [x0]
; CHECK-NEXT: ldrb w13, [x0, #18]
; CHECK-NEXT: ldrh w14, [x0, #16]
; CHECK-NEXT: ldp x9, x10, [x0]
; CHECK-NEXT: ldrb w14, [x0, #18]
; CHECK-NEXT: ldrh w15, [x0, #16]
; CHECK-NEXT: add x0, x0, #32
; CHECK-NEXT: ubfx x12, x9, #12, #20
; CHECK-NEXT: fmov s0, w10
; CHECK-NEXT: lsr x11, x10, #19
; CHECK-NEXT: lsr x15, x9, #31
; CHECK-NEXT: fmov s1, w12
; CHECK-NEXT: lsr x12, x9, #50
; CHECK-NEXT: mov.s v0[1], w11
; CHECK-NEXT: orr w11, w14, w13, lsl #16
; CHECK-NEXT: lsr x13, x10, #38
; CHECK-NEXT: lsr x10, x10, #57
; CHECK-NEXT: mov.s v1[1], w15
; CHECK-NEXT: orr w12, w12, w11, lsl #14
; CHECK-NEXT: orr w9, w10, w9, lsl #7
; CHECK-NEXT: lsr w10, w11, #5
; CHECK-NEXT: mov.s v0[2], w13
; CHECK-NEXT: ubfx x12, x10, #12, #20
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: lsr x11, x9, #19
; CHECK-NEXT: lsr x13, x10, #31
; CHECK-NEXT: fmov s0, w12
; CHECK-NEXT: lsr x12, x9, #38
; CHECK-NEXT: extr x9, x10, x9, #57
; CHECK-NEXT: mov.s v1[1], w11
; CHECK-NEXT: orr x11, x15, x14, lsl #16
; CHECK-NEXT: mov.s v0[1], w13
; CHECK-NEXT: extr x13, x11, x10, #50
; CHECK-NEXT: ubfx x10, x11, #5, #27
; CHECK-NEXT: mov.s v1[2], w12
; CHECK-NEXT: mov.s v0[3], w9
; CHECK-NEXT: mov.s v1[3], w10
; CHECK-NEXT: uzp1.8h v0, v0, v1
; CHECK-NEXT: mov.s v0[2], w13
; CHECK-NEXT: mov.s v1[3], w9
; CHECK-NEXT: mov.s v0[3], w10
; CHECK-NEXT: uzp1.8h v0, v1, v0
; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
; CHECK-NEXT: add x8, x8, #1
Expand All @@ -608,35 +606,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: .LBB5_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ldp x10, x9, [x0]
; CHECK-BE-NEXT: ldrb w16, [x0, #18]
; CHECK-BE-NEXT: lsr x11, x9, #40
; CHECK-BE-NEXT: ubfx x12, x9, #33, #7
; CHECK-BE-NEXT: lsr x15, x10, #45
; CHECK-BE-NEXT: lsr x13, x10, #40
; CHECK-BE-NEXT: ubfx x14, x10, #26, #14
; CHECK-BE-NEXT: orr w11, w12, w11, lsl #7
; CHECK-BE-NEXT: ldrh w12, [x0, #16]
; CHECK-BE-NEXT: fmov s0, w15
; CHECK-BE-NEXT: orr w13, w14, w13, lsl #14
; CHECK-BE-NEXT: ubfx x14, x9, #14, #18
; CHECK-BE-NEXT: ldrh w16, [x0, #16]
; CHECK-BE-NEXT: ldrb w17, [x0, #18]
; CHECK-BE-NEXT: add x0, x0, #32
; CHECK-BE-NEXT: fmov s1, w11
; CHECK-BE-NEXT: orr w11, w16, w12, lsl #8
; CHECK-BE-NEXT: lsl x12, x9, #24
; CHECK-BE-NEXT: mov v0.s[1], w13
; CHECK-BE-NEXT: lsl x11, x9, #24
; CHECK-BE-NEXT: lsr x12, x9, #40
; CHECK-BE-NEXT: lsr x13, x10, #45
; CHECK-BE-NEXT: lsl x14, x10, #24
; CHECK-BE-NEXT: lsr x15, x10, #40
; CHECK-BE-NEXT: extr x12, x12, x11, #57
; CHECK-BE-NEXT: fmov s0, w13
; CHECK-BE-NEXT: ubfx x13, x10, #7, #25
; CHECK-BE-NEXT: extr x14, x15, x14, #50
; CHECK-BE-NEXT: ubfx x15, x9, #14, #18
; CHECK-BE-NEXT: extr x9, x10, x9, #40
; CHECK-BE-NEXT: orr w12, w11, w12
; CHECK-BE-NEXT: mov v1.s[1], w14
; CHECK-BE-NEXT: lsr w12, w12, #19
; CHECK-BE-NEXT: fmov s1, w12
; CHECK-BE-NEXT: orr w12, w17, w16, lsl #8
; CHECK-BE-NEXT: mov v0.s[1], w14
; CHECK-BE-NEXT: ubfx x9, x9, #12, #20
; CHECK-BE-NEXT: orr w11, w12, w11
; CHECK-BE-NEXT: mov v1.s[1], w15
; CHECK-BE-NEXT: lsr w11, w11, #19
; CHECK-BE-NEXT: mov v0.s[2], w13
; CHECK-BE-NEXT: mov v1.s[2], w12
; CHECK-BE-NEXT: mov v1.s[2], w11
; CHECK-BE-NEXT: mov v0.s[3], w9
; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
; CHECK-BE-NEXT: add x8, x8, #1
; CHECK-BE-NEXT: cmp x8, #1000
; CHECK-BE-NEXT: mov v1.s[3], w11
; CHECK-BE-NEXT: mov v1.s[3], w12
; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-BE-NEXT: xtn v0.8b, v0.8h
; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
Expand All @@ -650,35 +647,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: .LBB5_1: // %loop
; CHECK-DISABLE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-DISABLE-NEXT: ldp x10, x9, [x0]
; CHECK-DISABLE-NEXT: ldrb w16, [x0, #18]
; CHECK-DISABLE-NEXT: lsr x11, x9, #40
; CHECK-DISABLE-NEXT: ubfx x12, x9, #33, #7
; CHECK-DISABLE-NEXT: lsr x15, x10, #45
; CHECK-DISABLE-NEXT: lsr x13, x10, #40
; CHECK-DISABLE-NEXT: ubfx x14, x10, #26, #14
; CHECK-DISABLE-NEXT: orr w11, w12, w11, lsl #7
; CHECK-DISABLE-NEXT: ldrh w12, [x0, #16]
; CHECK-DISABLE-NEXT: fmov s0, w15
; CHECK-DISABLE-NEXT: orr w13, w14, w13, lsl #14
; CHECK-DISABLE-NEXT: ubfx x14, x9, #14, #18
; CHECK-DISABLE-NEXT: ldrh w16, [x0, #16]
; CHECK-DISABLE-NEXT: ldrb w17, [x0, #18]
; CHECK-DISABLE-NEXT: add x0, x0, #32
; CHECK-DISABLE-NEXT: fmov s1, w11
; CHECK-DISABLE-NEXT: orr w11, w16, w12, lsl #8
; CHECK-DISABLE-NEXT: lsl x12, x9, #24
; CHECK-DISABLE-NEXT: mov v0.s[1], w13
; CHECK-DISABLE-NEXT: lsl x11, x9, #24
; CHECK-DISABLE-NEXT: lsr x12, x9, #40
; CHECK-DISABLE-NEXT: lsr x13, x10, #45
; CHECK-DISABLE-NEXT: lsl x14, x10, #24
; CHECK-DISABLE-NEXT: lsr x15, x10, #40
; CHECK-DISABLE-NEXT: extr x12, x12, x11, #57
; CHECK-DISABLE-NEXT: fmov s0, w13
; CHECK-DISABLE-NEXT: ubfx x13, x10, #7, #25
; CHECK-DISABLE-NEXT: extr x14, x15, x14, #50
; CHECK-DISABLE-NEXT: ubfx x15, x9, #14, #18
; CHECK-DISABLE-NEXT: extr x9, x10, x9, #40
; CHECK-DISABLE-NEXT: orr w12, w11, w12
; CHECK-DISABLE-NEXT: mov v1.s[1], w14
; CHECK-DISABLE-NEXT: lsr w12, w12, #19
; CHECK-DISABLE-NEXT: fmov s1, w12
; CHECK-DISABLE-NEXT: orr w12, w17, w16, lsl #8
; CHECK-DISABLE-NEXT: mov v0.s[1], w14
; CHECK-DISABLE-NEXT: ubfx x9, x9, #12, #20
; CHECK-DISABLE-NEXT: orr w11, w12, w11
; CHECK-DISABLE-NEXT: mov v1.s[1], w15
; CHECK-DISABLE-NEXT: lsr w11, w11, #19
; CHECK-DISABLE-NEXT: mov v0.s[2], w13
; CHECK-DISABLE-NEXT: mov v1.s[2], w12
; CHECK-DISABLE-NEXT: mov v1.s[2], w11
; CHECK-DISABLE-NEXT: mov v0.s[3], w9
; CHECK-DISABLE-NEXT: add x9, x1, x8, lsl #3
; CHECK-DISABLE-NEXT: add x8, x8, #1
; CHECK-DISABLE-NEXT: cmp x8, #1000
; CHECK-DISABLE-NEXT: mov v1.s[3], w11
; CHECK-DISABLE-NEXT: mov v1.s[3], w12
; CHECK-DISABLE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h
; CHECK-DISABLE-NEXT: st1 { v0.8b }, [x9]
Expand Down
15 changes: 15 additions & 0 deletions llvm/test/CodeGen/X86/pr92720.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s

; Make sure we don't crash when shrinking the shift amount before legalization.
define i64 @pr92720(i64 %x) {
; CHECK-LABEL: pr92720:
; CHECK: # %bb.0:
; CHECK-NEXT: movabsq $8589934592, %rax # imm = 0x200000000
; CHECK-NEXT: retq
%or = or i64 %x, 255
%sub = sub i64 0, %or
%shl = shl i64 1, %sub
%sext = shl i64 %shl, 32
ret i64 %sext
}

0 comments on commit cf12830

Please sign in to comment.