diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ddc384096f4c..10895cbc21f1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -424,6 +424,35 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
     MI.eraseFromParent();
     return true;
   }
+
+  // {0, 1, ..., n/4-1, n/2, n/2+1, ..., 3n/4-1} -> G_UNMERGE_VALUES
+  // Take the first halves of the two vectors and concatenate them into one
+  // vector.
+  std::function()> FirstEightA =
+      adderGenerator(0, (DstNumElts / 2) - 1, 1);
+  std::function()> FirstEightB =
+      adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);
+
+  std::function()> FirstAndThird =
+      concatGenerators(SmallVector()>>{
+          FirstEightA, FirstEightB});
+  if (matchCombineShuffleVectorSimple(MI, FirstAndThird,
+                                      (DstNumElts / 2) - 1)) {
+    if (DstNumElts <= 2)
+      return false;
+    const Register DstReg = MI.getOperand(0).getReg();
+    const LLT HalfSrcTy =
+        LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
+    const Register HalfOfA =
+        createUnmergeValue(MI, MI.getOperand(1).getReg(),
+                           MRI.createGenericVirtualRegister(HalfSrcTy), 0);
+    const Register HalfOfB =
+        createUnmergeValue(MI, MI.getOperand(2).getReg(),
+                           MRI.createGenericVirtualRegister(HalfSrcTy), 0);
+    Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
+    MI.eraseFromParent();
+    return true;
+  }

   return false;
 }
diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir
index 050430a25188..afe0108d6854 100644
--- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir
+++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir
@@ -285,3 +285,66 @@ body: |
     %1:_(<128 x s8>) = COPY $y2
     %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7)
     PseudoRET implicit $lr, implicit %2
+...
+
+---
+name: insert_vector_16_elements
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: insert_vector_16_elements
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>)
+    %1:_(<16 x s32>) = COPY $x0
+    %2:_(<16 x s32>) = COPY $x1
+    %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23)
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: insert_vector_8_elements
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $wl0, $wl1
+    ; CHECK-LABEL: name: insert_vector_8_elements
+    ; CHECK: liveins: $wl0, $wl1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1
+    ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>)
+    ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>)
+    %1:_(<8 x s32>) = COPY $wl0
+    %2:_(<8 x s32>) = COPY $wl1
+    %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11)
+    PseudoRET implicit $lr, implicit %3
+...
+
+---
+name: insert_vector_128_elements
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $y2, $y3
+    ; CHECK-LABEL: name: insert_vector_128_elements
+    ; CHECK: liveins: $y2, $y3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>)
+    %1:_(<128 x s8>) = COPY $y2
+    %2:_(<128 x s8>) = COPY $y3
+    %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191)
+    PseudoRET implicit $lr, implicit %3
diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
index 601cebab049c..6c967b35fad1 100644
--- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
+++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
@@ -48,100 +48,27 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
 ; CHECK-LABEL: test_insert_vector:
 ; CHECK: .p2align 4
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: nopa ; nopb ; jz r0, #.LBB1_2
-; CHECK-NEXT: mov r24, r16 // Delay Slot 5
-; CHECK-NEXT: mov r25, r17 // Delay Slot 4
-; CHECK-NEXT: mov r26, r18 // Delay Slot 3
-; CHECK-NEXT: mov r27, r19 // Delay Slot 2
-; CHECK-NEXT: mova r16, #0 // Delay Slot 1
+; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
+; CHECK-NEXT: nopa ; nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: nop // Delay Slot 2
+; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
 ; CHECK-NEXT: // %bb.1: // %if.end
-; CHECK-NEXT: nopx ; vextract.s32 r0, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #1
-; CHECK-NEXT: mova r17, #5
-; CHECK-NEXT: mova r19, #6
-; CHECK-NEXT: mova r18, #0
-; CHECK-NEXT: vextract.s32 r1, x2, r16
-; CHECK-NEXT: vextract.s32 r8, x4, r18
-; CHECK-NEXT: movx r16, #2
-; CHECK-NEXT: mova r18, #1
-; CHECK-NEXT: vextract.s32 r5, x2, r17
-; CHECK-NEXT: vextract.s32 r6, x2, r19
-; CHECK-NEXT: vextract.s32 r13, x4, r17
-; CHECK-NEXT: vextract.s32 r15, x4, r19
-; CHECK-NEXT: vextract.s32 r2, x2, r16
-; CHECK-NEXT: vextract.s32 r9, x4, r18
-; CHECK-NEXT: movx r16, #3
-; CHECK-NEXT: mova r18, #2
-; CHECK-NEXT: vextract.s32 r10, x4, r18
-; CHECK-NEXT: vextract.s32 r3, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r4, x2, r16
-; CHECK-NEXT: movx r18, #3
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r11, x4, r18
-; CHECK-NEXT: j #.LBB1_3
-; CHECK-NEXT: mova r18, #4 // Delay Slot 5
-; CHECK-NEXT: vextract.s32 r7, x2, r16 // Delay Slot 4
-; CHECK-NEXT: vextract.s32 r12, x4, r18 // Delay Slot 3
-; CHECK-NEXT: vextract.s32 r14, x4, r16 // Delay Slot 2
+; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
 ; CHECK-NEXT: nop // Delay Slot 1
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: .LBB1_2: // %if.then
-; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #1
-; CHECK-NEXT: mova r17, #5
-; CHECK-NEXT: mova r19, #6
-; CHECK-NEXT: mova r18, #0
-; CHECK-NEXT: vextract.s32 r1, x4, r16
-; CHECK-NEXT: vextract.s32 r8, x2, r18
-; CHECK-NEXT: movx r16, #2
-; CHECK-NEXT: mova r18, #1
-; CHECK-NEXT: vextract.s32 r5, x4, r17
-; CHECK-NEXT: vextract.s32 r6, x4, r19
-; CHECK-NEXT: vextract.s32 r13, x2, r17
-; CHECK-NEXT: vextract.s32 r15, x2, r19
-; CHECK-NEXT: vextract.s32 r2, x4, r16
-; CHECK-NEXT: vextract.s32 r9, x2, r18
-; CHECK-NEXT: movx r16, #3
-; CHECK-NEXT: mova r18, #2
-; CHECK-NEXT: vextract.s32 r3, x4, r16
-; CHECK-NEXT: vextract.s32 r10, x2, r18
-; CHECK-NEXT: movx r16, #4
-; CHECK-NEXT: mova r18, #3
-; CHECK-NEXT: vextract.s32 r4, x4, r16
-; CHECK-NEXT: vextract.s32 r11, x2, r18
-; CHECK-NEXT: movx r16, #7
-; CHECK-NEXT: mova r18, #4
-; CHECK-NEXT: vextract.s32 r7, x4, r16
-; CHECK-NEXT: vextract.s32 r12, x2, r18
-; CHECK-NEXT: vextract.s32 r14, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB1_3: // %cleanup
-; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
-; CHECK-NEXT: mov r18, r26
-; CHECK-NEXT: mov r17, r25
-; CHECK-NEXT: vpush.lo.32 x0, r14, x0
-; CHECK-NEXT: vpush.lo.32 x0, r15, x0
-; CHECK-NEXT: vpush.lo.32 x0, r13, x0
-; CHECK-NEXT: vpush.lo.32 x0, r12, x0
-; CHECK-NEXT: vpush.lo.32 x0, r11, x0
-; CHECK-NEXT: vpush.lo.32 x0, r10, x0
-; CHECK-NEXT: vpush.lo.32 x0, r9, x0
-; CHECK-NEXT: vpush.lo.32 x0, r8, x0
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r6, x0
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0
 ; CHECK-NEXT: ret lr
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3
-; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
-; CHECK-NEXT: mov r16, r24 // Delay Slot 1
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
 entry:
   %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32>
   %cmp = icmp eq i32 %idx, 0
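Note: the helpers the new combine calls (adderGenerator, concatGenerators, matchCombineShuffleVectorSimple, createUnmergeValue) are defined elsewhere in the AIE fork and are not part of this hunk. The standalone sketch below only illustrates the generator-based mask matching the combine relies on; the helper names come from the hunk, but the int64_t return type, the -1 "exhausted" convention, and the maskMatches check are assumptions for illustration, not the actual implementation.

#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

// Yields From, From + Step, ..., To, then -1 forever (assumed behaviour).
static std::function<int64_t()> adderGenerator(int64_t From, int64_t To,
                                               int64_t Step) {
  int64_t Next = From;
  return [=]() mutable -> int64_t {
    if (Next > To)
      return -1;
    int64_t Cur = Next;
    Next += Step;
    return Cur;
  };
}

// Chains generators: once one is exhausted (-1), the next one takes over.
static std::function<int64_t()>
concatGenerators(std::vector<std::function<int64_t()>> Gens) {
  size_t Idx = 0;
  return [=]() mutable -> int64_t {
    while (Idx < Gens.size()) {
      int64_t Val = Gens[Idx]();
      if (Val != -1)
        return Val;
      ++Idx;
    }
    return -1;
  };
}

// A mask matches when every element equals the next generated value and the
// generator is exhausted afterwards.
static bool maskMatches(const std::vector<int64_t> &Mask,
                        std::function<int64_t()> Gen) {
  for (int64_t Elt : Mask)
    if (Elt != Gen())
      return false;
  return Gen() == -1;
}

int main() {
  // Mask of the <16 x s32> MIR test above: first half of A, first half of B.
  std::vector<int64_t> Mask;
  for (int64_t I = 0; I < 8; ++I)
    Mask.push_back(I);   // 0..7   -> first half of operand A
  for (int64_t I = 16; I < 24; ++I)
    Mask.push_back(I);   // 16..23 -> first half of operand B
  assert(maskMatches(Mask, concatGenerators({adderGenerator(0, 7, 1),
                                             adderGenerator(16, 23, 1)})));
  return 0;
}

Run against the <16 x s32> test above, the chained generator reproduces shufflemask(0..7, 16..23), which is exactly the pattern the combine rewrites into G_UNMERGE_VALUES plus G_CONCAT_VECTORS of the two low halves.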