Skip to content

Commit

Permalink
[AIE2] Shuffle pattern for reversing vector order
Browse files Browse the repository at this point in the history
  • Loading branch information
ValentijnvdBeek committed Jun 11, 2024
1 parent 3cbef8d commit 72a0a03
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 89 deletions.
51 changes: 45 additions & 6 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,13 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;

// type helper
using generator = std::function<std::optional<int32_t>()>;

// {1, 2, ..., n} -> G_CONCAT_VECTORS
// Turns a shuffle vector whose mask only counts up into a concat vectors
// instruction
std::function<std::optional<int32_t>()> CountUp =
adderGenerator(0, DstNumElts - 1, 1);
generator CountUp = adderGenerator(0, DstNumElts - 1, 1);
if (matchCombineShuffleVector(MI, Ops, CountUp, 2 * SrcNumElts)) {
applyCombineShuffleVector(MI, Ops);
return true;
Expand All @@ -394,8 +396,7 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
// {1, 2, ..., |DstVector|} -> G_UNMERGE_VALUES
// Extracts the first chunk of the same size of the destination vector from
// the source
std::function<std::optional<int32_t>()> FirstQuarter =
adderGenerator(0, DstNumElts - 1, 1);
generator FirstQuarter = adderGenerator(0, DstNumElts - 1, 1);
if (matchCombineShuffleVectorSimple(MI, FirstQuarter, DstNumElts - 1)) {
if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0)
return false;
Expand All @@ -407,15 +408,53 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
// {|DstVector|, |DstVector|+1, ..., 2 * |DstVector|} -> G_UNMERGE_VALUES
// Extracts the second chunk of the same size of the destination vector from
// the source
std::function<std::optional<int32_t>()> SecondQuarter =
adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1);
generator SecondQuarter = adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1);
if (matchCombineShuffleVectorSimple(MI, SecondQuarter, DstNumElts - 1)) {
if (((SrcNumElts / 2) % 2) != 0)
return false;
createUnmergeValue(MI, MI.getOperand(1).getReg(), DstReg, 1);
MI.eraseFromParent();
return true;
}

// {1, 2, ..., n/4, n/2, n/2+1, ..., 3n/4} -> G_UNMERGE_VALUES + merge
// Take the first halves of the two source vectors and concatenate them into
// one vector.
generator FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1);
generator FirstEightB =
adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);

generator FirstAndThird =
concatGenerators(SmallVector<generator>{FirstEightA, FirstEightB});
if (matchCombineShuffleVectorSimple(MI, FirstAndThird,
(DstNumElts / 2) - 1)) {
if (DstNumElts <= 2)
return false;
const Register DstReg = MI.getOperand(0).getReg();
const LLT HalfSrcTy =
LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
const Register HalfOfA =
createUnmergeValue(MI, MI.getOperand(1).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
const Register HalfOfB =
createUnmergeValue(MI, MI.getOperand(2).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
MI.eraseFromParent();
return true;
}

// {n/2, n/2+1, ..., n, 0, 1, ..., n/2-1}
generator FirstHalf = adderGenerator(0, SrcNumElts / 2, 1);
generator SecondHalf = adderGenerator(SrcNumElts / 2, SrcNumElts, 1);
generator Reverse =
concatGenerators(SmallVector<generator>{FirstHalf, SecondHalf});

if (matchCombineShuffleVectorSimple(MI, Reverse, SrcNumElts)) {
applyCombineShuffleVector(MI, {Ops[1], Ops[0]});
return true;
}

return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,78 @@ body: |
%1:_(<128 x s8>) = COPY $y2
%2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7)
PseudoRET implicit $lr, implicit %2
...

---
name: insert_vector_16_elements
legalized: false
body: |
bb.1.entry:
liveins: $x0, $x1
; CHECK-LABEL: name: insert_vector_16_elements
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>)
%1:_(<16 x s32>) = COPY $x0
%2:_(<16 x s32>) = COPY $x1
%3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
PseudoRET implicit $lr, implicit %3
...

---
name: insert_vector_8_elements
legalized: false
body: |
bb.1.entry:
liveins: $wl0, $wl1
; CHECK-LABEL: name: insert_vector_8_elements
; CHECK: liveins: $wl0, $wl1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1
; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>)
; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>)
%1:_(<8 x s32>) = COPY $wl0
%2:_(<8 x s32>) = COPY $wl1
%3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11)
PseudoRET implicit $lr, implicit %3
...

---
name: insert_vector_128_elements
legalized: false
body: |
bb.1.entry:
liveins: $y2, $y3
; CHECK-LABEL: name: insert_vector_128_elements
; CHECK: liveins: $y2, $y3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>)
%1:_(<128 x s8>) = COPY $y2
%2:_(<128 x s8>) = COPY $y3
%3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191)
PseudoRET implicit $lr, implicit %3
---
name: concat_vector_reverse_32_512
legalized: false
body: |
bb.1.entry:
liveins: $wl2, $wl4
%1:_(<8 x s32>) = COPY $wl2
%2:_(<8 x s32>) = COPY $wl4
%0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)
$x0 = COPY %0:_(<16 x s32>)
PseudoRET implicit $lr, implicit $x0
...
97 changes: 14 additions & 83 deletions llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
Original file line number Diff line number Diff line change
Expand Up @@ -49,95 +49,26 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
; CHECK-NEXT: nopx // Delay Slot 5
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: mov r24, r16 // Delay Slot 2
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %if.end
; CHECK-NEXT: vextract.s32 r0, x2, r16
; CHECK-NEXT: vextract.s32 r1, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #1
; CHECK-NEXT: vextract.s32 r2, x2, r16
; CHECK-NEXT: vextract.s32 r3, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #2
; CHECK-NEXT: vextract.s32 r4, x2, r16
; CHECK-NEXT: vextract.s32 r5, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #3
; CHECK-NEXT: vextract.s32 r6, x2, r16
; CHECK-NEXT: vextract.s32 r7, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #4
; CHECK-NEXT: vextract.s32 r8, x2, r16
; CHECK-NEXT: vextract.s32 r9, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #5
; CHECK-NEXT: vextract.s32 r10, x2, r16
; CHECK-NEXT: vextract.s32 r11, x4, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #7
; CHECK-NEXT: vextract.s32 r12, x2, r16
; CHECK-NEXT: j #.LBB1_3
; CHECK-NEXT: vextract.s32 r13, x4, r16 // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: mova r16, #6 // Delay Slot 3
; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 2
; CHECK-NEXT: vextract.s32 r15, x4, r16 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
; CHECK-NEXT: nopx // Delay Slot 5
; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: // %if.then
; CHECK-NEXT: nopa ; nopx ; vextract.s32 r0, x4, r16
; CHECK-NEXT: vextract.s32 r1, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #1
; CHECK-NEXT: vextract.s32 r2, x4, r16
; CHECK-NEXT: vextract.s32 r3, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #2
; CHECK-NEXT: vextract.s32 r4, x4, r16
; CHECK-NEXT: vextract.s32 r5, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #3
; CHECK-NEXT: vextract.s32 r6, x4, r16
; CHECK-NEXT: vextract.s32 r7, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #4
; CHECK-NEXT: vextract.s32 r8, x4, r16
; CHECK-NEXT: vextract.s32 r9, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #5
; CHECK-NEXT: vextract.s32 r10, x4, r16
; CHECK-NEXT: vextract.s32 r11, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #7
; CHECK-NEXT: vextract.s32 r12, x4, r16
; CHECK-NEXT: vextract.s32 r13, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #6
; CHECK-NEXT: vextract.s32 r14, x4, r16
; CHECK-NEXT: vextract.s32 r15, x2, r16
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_3: // %cleanup
; CHECK-NEXT: nopa ; nopb ; nopx ; vpush.lo.32 x0, r13, x0
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
; CHECK-NEXT: vpush.lo.32 x0, r3, x0
; CHECK-NEXT: vpush.lo.32 x0, r1, x0
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
; CHECK-NEXT: ret lr
; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%cmp = icmp eq i32 %idx, 0
Expand Down

0 comments on commit 72a0a03

Please sign in to comment.