Skip to content

Commit

Permalink
[AIE2] Add a pattern that combines the first halfs of two vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
ValentijnvdBeek committed May 22, 2024
1 parent 4126ae9 commit 0417b1c
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 89 deletions.
29 changes: 29 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,35 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
MI.eraseFromParent();
return true;
}

// {0, 1, ..., n/2 - 1, n, n + 1, ..., n + n/2 - 1}, with n = DstNumElts
// -> G_UNMERGE_VALUES of each source, then a merge of the two first halves.
// Take the first halves of the two vectors and concatenate them into one
// vector.
std::function<std::optional<int32_t>()> FirstEightA =
adderGenerator(0, (DstNumElts / 2) - 1, 1);
std::function<std::optional<int32_t>()> FirstEightB =
adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);

std::function<std::optional<int32_t>()> FirstAndThird =
concatGenerators(SmallVector<std::function<std::optional<int32_t>()>>{
FirstEightA, FirstEightB});
if (matchCombineShuffleVectorSimple(MI, FirstAndThird,
(DstNumElts / 2) - 1)) {
if (DstNumElts <= 2)
return false;
const Register DstReg = MI.getOperand(0).getReg();
const LLT HalfSrcTy =
LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
const Register HalfOfA =
createUnmergeValue(MI, MI.getOperand(1).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
const Register HalfOfB =
createUnmergeValue(MI, MI.getOperand(2).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0);
Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
MI.eraseFromParent();
return true;
}
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,66 @@ body: |
%1:_(<128 x s8>) = COPY $y2
%2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7)
PseudoRET implicit $lr, implicit %2
...

---
# Shuffle of two <16 x s32> sources whose mask selects elements 0-7 of the
# first source followed by elements 0-7 of the second source (mask indices
# 16-23). The CHECK lines expect each source to be split with
# G_UNMERGE_VALUES and the two first halves joined with G_CONCAT_VECTORS.
name: insert_vector_16_elements
legalized: false
body: |
bb.1.entry:
liveins: $x0, $x1
; CHECK-LABEL: name: insert_vector_16_elements
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>)
%1:_(<16 x s32>) = COPY $x0
%2:_(<16 x s32>) = COPY $x1
%3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
PseudoRET implicit $lr, implicit %3
...

---
# Shuffle of two <8 x s32> sources whose mask selects elements 0-3 of the
# first source followed by elements 0-3 of the second source (mask indices
# 8-11). The CHECK lines expect the first half of each source to be taken
# with G_AIE_UNPAD_VECTOR (rather than G_UNMERGE_VALUES) and the two halves
# joined with G_CONCAT_VECTORS.
name: insert_vector_8_elements
legalized: false
body: |
bb.1.entry:
liveins: $wl0, $wl1
; CHECK-LABEL: name: insert_vector_8_elements
; CHECK: liveins: $wl0, $wl1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1
; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>)
; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>)
%1:_(<8 x s32>) = COPY $wl0
%2:_(<8 x s32>) = COPY $wl1
%3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11)
PseudoRET implicit $lr, implicit %3
...

---
# Shuffle of two <128 x s8> sources whose mask selects elements 0-63 of the
# first source followed by elements 0-63 of the second source (mask indices
# 128-191). The CHECK lines expect each source to be split with
# G_UNMERGE_VALUES and the two first halves joined with G_CONCAT_VECTORS.
name: insert_vector_128_elements
legalized: false
body: |
bb.1.entry:
liveins: $y2, $y3
; CHECK-LABEL: name: insert_vector_128_elements
; CHECK: liveins: $y2, $y3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>)
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>)
%1:_(<128 x s8>) = COPY $y2
%2:_(<128 x s8>) = COPY $y3
%3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191)
PseudoRET implicit $lr, implicit %3
105 changes: 16 additions & 89 deletions llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,100 +48,27 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
; CHECK-LABEL: test_insert_vector:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: nopa ; nopb ; jz r0, #.LBB1_2
; CHECK-NEXT: mov r24, r16 // Delay Slot 5
; CHECK-NEXT: mov r25, r17 // Delay Slot 4
; CHECK-NEXT: mov r26, r18 // Delay Slot 3
; CHECK-NEXT: mov r27, r19 // Delay Slot 2
; CHECK-NEXT: mova r16, #0 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %if.end
; CHECK-NEXT: nopx ; vextract.s32 r0, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #1
; CHECK-NEXT: mova r17, #5
; CHECK-NEXT: mova r19, #6
; CHECK-NEXT: mova r18, #0
; CHECK-NEXT: vextract.s32 r1, x2, r16
; CHECK-NEXT: vextract.s32 r8, x4, r18
; CHECK-NEXT: movx r16, #2
; CHECK-NEXT: mova r18, #1
; CHECK-NEXT: vextract.s32 r5, x2, r17
; CHECK-NEXT: vextract.s32 r6, x2, r19
; CHECK-NEXT: vextract.s32 r13, x4, r17
; CHECK-NEXT: vextract.s32 r15, x4, r19
; CHECK-NEXT: vextract.s32 r2, x2, r16
; CHECK-NEXT: vextract.s32 r9, x4, r18
; CHECK-NEXT: movx r16, #3
; CHECK-NEXT: mova r18, #2
; CHECK-NEXT: vextract.s32 r10, x4, r18
; CHECK-NEXT: vextract.s32 r3, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #4
; CHECK-NEXT: vextract.s32 r4, x2, r16
; CHECK-NEXT: movx r18, #3
; CHECK-NEXT: mova r16, #7
; CHECK-NEXT: vextract.s32 r11, x4, r18
; CHECK-NEXT: j #.LBB1_3
; CHECK-NEXT: mova r18, #4 // Delay Slot 5
; CHECK-NEXT: vextract.s32 r7, x2, r16 // Delay Slot 4
; CHECK-NEXT: vextract.s32 r12, x4, r18 // Delay Slot 3
; CHECK-NEXT: vextract.s32 r14, x4, r16 // Delay Slot 2
; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
; CHECK-NEXT: nopx // Delay Slot 5
; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: // %if.then
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv
; CHECK-NEXT: nop
; CHECK-NEXT: mova r16, #1
; CHECK-NEXT: mova r17, #5
; CHECK-NEXT: mova r19, #6
; CHECK-NEXT: mova r18, #0
; CHECK-NEXT: vextract.s32 r1, x4, r16
; CHECK-NEXT: vextract.s32 r8, x2, r18
; CHECK-NEXT: movx r16, #2
; CHECK-NEXT: mova r18, #1
; CHECK-NEXT: vextract.s32 r5, x4, r17
; CHECK-NEXT: vextract.s32 r6, x4, r19
; CHECK-NEXT: vextract.s32 r13, x2, r17
; CHECK-NEXT: vextract.s32 r15, x2, r19
; CHECK-NEXT: vextract.s32 r2, x4, r16
; CHECK-NEXT: vextract.s32 r9, x2, r18
; CHECK-NEXT: movx r16, #3
; CHECK-NEXT: mova r18, #2
; CHECK-NEXT: vextract.s32 r3, x4, r16
; CHECK-NEXT: vextract.s32 r10, x2, r18
; CHECK-NEXT: movx r16, #4
; CHECK-NEXT: mova r18, #3
; CHECK-NEXT: vextract.s32 r4, x4, r16
; CHECK-NEXT: vextract.s32 r11, x2, r18
; CHECK-NEXT: movx r16, #7
; CHECK-NEXT: mova r18, #4
; CHECK-NEXT: vextract.s32 r7, x4, r16
; CHECK-NEXT: vextract.s32 r12, x2, r18
; CHECK-NEXT: vextract.s32 r14, x2, r16
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_3: // %cleanup
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
; CHECK-NEXT: mov r18, r26
; CHECK-NEXT: mov r17, r25
; CHECK-NEXT: vpush.lo.32 x0, r14, x0
; CHECK-NEXT: vpush.lo.32 x0, r15, x0
; CHECK-NEXT: vpush.lo.32 x0, r13, x0
; CHECK-NEXT: vpush.lo.32 x0, r12, x0
; CHECK-NEXT: vpush.lo.32 x0, r11, x0
; CHECK-NEXT: vpush.lo.32 x0, r10, x0
; CHECK-NEXT: vpush.lo.32 x0, r9, x0
; CHECK-NEXT: vpush.lo.32 x0, r8, x0
; CHECK-NEXT: vpush.lo.32 x0, r7, x0
; CHECK-NEXT: vpush.lo.32 x0, r6, x0
; CHECK-NEXT: vpush.lo.32 x0, r5, x0
; CHECK-NEXT: vpush.lo.32 x0, r4, x0
; CHECK-NEXT: ret lr
; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5
; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4
; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3
; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
; CHECK-NEXT: mov r16, r24 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%cmp = icmp eq i32 %idx, 0
Expand Down

0 comments on commit 0417b1c

Please sign in to comment.