From 70e1ba53ec56a6f808d86b0afe14b8178aa9dd80 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 23 Jan 2024 11:19:33 -0800
Subject: [PATCH] [RISCV] Recurse on second operand of two operand shuffles

This builds on bdc41106ee48dce59c500c9a3957af947f30c8c3.  This change
completes the migration to a recursive shuffle lowering strategy: when we
encounter an unknown two-argument shuffle, we lower each operand as a
single-source permute and then combine the results with a vselect (i.e. a
vmerge).  For code quality, this relies on the post-isel combine, which
aggressively folds that vmerge back into the materialization of the second
operand if possible.

Note: This change includes only the most immediately obvious stylistic
cleanup.  There is a bunch of code movement this enables, which I'll do as
a separate patch since rolling it into this one would create an
unreadable diff.
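To make the strategy concrete, here is a minimal standalone sketch (plain
C++, not LLVM code; all names are illustrative) of the mask decomposition
the new lowering performs: each lane of a two-source shuffle mask is routed
into a single-source mask for the LHS or the RHS, plus a per-lane select
mask that later drives the vmerge.

  #include <cstdio>
  #include <vector>

  // Split a two-source shuffle mask (indices 0..2*NumElts-1, -1 for undef)
  // into two single-source permute masks plus a per-lane select mask,
  // mirroring the decomposition in lowerVECTOR_SHUFFLE.  Illustrative only.
  struct Decomposed {
    std::vector<int> LHS, RHS;  // single-source masks (-1 = undef lane)
    std::vector<bool> UseLHS;   // select mask: true picks the LHS lane
  };

  static Decomposed splitShuffleMask(const std::vector<int> &Mask,
                                     int NumElts) {
    Decomposed D;
    for (int MaskIndex : Mask) {
      bool IsLHSOrUndef = MaskIndex < NumElts;  // undef (-1) counts as LHS
      D.UseLHS.push_back(IsLHSOrUndef);
      D.LHS.push_back(IsLHSOrUndef && MaskIndex >= 0 ? MaskIndex : -1);
      D.RHS.push_back(IsLHSOrUndef ? -1 : MaskIndex - NumElts);
    }
    return D;
  }

  int main() {
    // Interleave the low halves of two 4-element vectors: a0 b0 a1 b1.
    Decomposed D = splitShuffleMask({0, 4, 1, 5}, 4);
    for (int i = 0; i < (int)D.LHS.size(); ++i)
      std::printf("lane %d: lhs=%d rhs=%d sel=%s\n", i, D.LHS[i], D.RHS[i],
                  D.UseLHS[i] ? "LHS" : "RHS");
    return 0;
  }

Each resulting single-source mask is then fed back into the normal shuffle
lowering, so every single-source permute trick applies to both halves.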
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  79 +----
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll  |  36 +-
 .../RISCV/rvv/fixed-vectors-int-interleave.ll |  36 +-
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |  23 +-
 .../rvv/fixed-vectors-interleaved-access.ll   | 333 ++++++++----------
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   |   7 +-
 .../rvv/fixed-vectors-shuffle-transpose.ll    | 129 +++----
 .../rvv/fixed-vectors-shufflevector-vnsrl.ll  |  36 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |   8 +-
 9 files changed, 296 insertions(+), 391 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e39888637062c6..c91a66e26f982b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4975,12 +4975,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,

   // As a backup, shuffles can be lowered via a vrgather instruction, possibly
   // merged with a second vrgather.
-  SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
-
-  // Keep a track of which non-undef indices are used by each LHS/RHS shuffle
-  // half.
-  DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
-
+  SmallVector<int> ShuffleMaskLHS, ShuffleMaskRHS;
   SmallVector<SDValue> MaskVals;

   // Now construct the mask that will be used by the blended vrgather operation.
@@ -4989,28 +4984,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps;
     MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
     bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
-    GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
-                                   ? DAG.getConstant(MaskIndex, DL, XLenVT)
-                                   : DAG.getUNDEF(XLenVT));
-    GatherIndicesRHS.push_back(
-        IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
-                          : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
-    if (IsLHSOrUndefIndex && MaskIndex >= 0)
-      ++LHSIndexCounts[MaskIndex];
-    if (!IsLHSOrUndefIndex)
-      ++RHSIndexCounts[MaskIndex - NumElts];
+    ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
+                                 ? MaskIndex : -1);
+    ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
   }

   if (SwapOps) {
     std::swap(V1, V2);
-    std::swap(GatherIndicesLHS, GatherIndicesRHS);
+    std::swap(ShuffleMaskLHS, ShuffleMaskRHS);
   }

   assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
   MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);

-  unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
   unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
   MVT IndexVT = VT.changeTypeToInteger();
   // Since we can't introduce illegal index types at this stage, use i16 and
@@ -5038,6 +5025,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   // are handled above.
   if (V2.isUndef()) {
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+    SmallVector<SDValue> GatherIndicesLHS;
+    for (int ShuffleIdx : ShuffleMaskLHS)
+      GatherIndicesLHS.push_back(ShuffleIdx != -1
+                                     ? DAG.getConstant(ShuffleIdx, DL, XLenVT)
+                                     : DAG.getUNDEF(XLenVT));
     SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
     LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
                                          Subtarget);
@@ -5046,50 +5038,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return convertFromScalableVector(VT, Gather, DAG, Subtarget);
   }

-  // Translate the gather index we computed above (and possibly swapped)
-  // back to a shuffle mask.  This step should disappear once we complete
-  // the migration to recursive design.
-  SmallVector<int> ShuffleMaskLHS;
-  ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
-  for (SDValue GatherIndex : GatherIndicesLHS) {
-    if (GatherIndex.isUndef()) {
-      ShuffleMaskLHS.push_back(-1);
-      continue;
-    }
-    auto *IdxC = cast<ConstantSDNode>(GatherIndex);
-    ShuffleMaskLHS.push_back(IdxC->getZExtValue());
-  }
-
-  // Recursively invoke lowering for the LHS as if there were no RHS.
-  // This allows us to leverage all of our single source permute tricks.
-  SDValue Gather =
-      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
-  Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
-
-  // Blend in second vector source with an additional vrgather.
-  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-
-  MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
-  SelectMask =
-      convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
-  // If only one index is used, we can use a "splat" vrgather.
-  // TODO: We can splat the most-common index and fix-up any stragglers, if
-  // that's beneficial.
-  if (RHSIndexCounts.size() == 1) {
-    int SplatIndex = RHSIndexCounts.begin()->getFirst();
-    Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
-                         DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
-                         SelectMask, VL);
-  } else {
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-    Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
-                         SelectMask, VL);
-  }
-
-  return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+  // Recursively invoke lowering for each operand as if we had two
+  // independent single source permutes, and then combine the results via a
+  // vselect.  Note that the vselect will likely be folded back into the
+  // second permute (vrgather, or other) by the post-isel combine.
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), ShuffleMaskRHS);
+  return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V2, V1);
 }

 bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
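A note for reading the test updates that follow: the select mask built from
MaskVals is materialized as a scalar constant moved into v0 with vmv.s.x, so
the magic integers in the check lines (170, 224, 0xAAAAAAAA via lui/addi,
etc.) are just per-lane "take the merged-in operand" bitmasks.  Here is a
standalone C++ recomputation (illustrative only; it ignores the SwapOps
canonicalization above, and the trn1 mask below is an assumption — the
shufflevector mask literals were lost from the .ll snippets in this
rendering — though the li a0, 170 in the check lines is consistent with the
standard even-lane transpose mask):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Bit i is set when lane i of the result comes from the second source,
  // i.e. the lane the vmerge must take from the merged-in operand.
  static uint64_t selectMaskBits(const std::vector<int> &Mask, int NumElts) {
    uint64_t Bits = 0;
    for (size_t i = 0; i < Mask.size(); ++i)
      if (Mask[i] >= NumElts)  // lane taken from the second source
        Bits |= uint64_t(1) << i;
    return Bits;
  }

  int main() {
    // trn1.v8i8 (assumed mask): odd lanes from %v1 -> 170 == 0xAA.
    assert(selectMaskBits({0, 8, 2, 10, 4, 12, 6, 14}, 8) == 170);

    // 32-lane interleave: every odd lane from the second source gives
    // 0xAAAAAAAA, which the tests materialize as lui 699051 + addi -1366.
    std::vector<int> Interleave;
    for (int i = 0; i < 16; ++i) {
      Interleave.push_back(i);       // lane from the first source
      Interleave.push_back(32 + i);  // lane from the second source
    }
    assert(selectMaskBits(Interleave, 32) == 0xAAAAAAAAu);
    assert(((699051u << 12) - 1366u) == 0xAAAAAAAAu);
    return 0;
  }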
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index dab530751ef96b..6bfd0ac932672f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,26 +238,44 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
 define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-LABEL: interleave_v32f32:
 ; V128:       # %bb.0:
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT:    vmv8r.v v0, v16
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v16, v8
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v8, 16
+; V128-NEXT:    vslidedown.vi v8, v0, 16
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v24, v0, v8
 ; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vwmaccu.vx v24, a0, v8
-; V128-NEXT:    lui a1, %hi(.LCPI10_0)
-; V128-NEXT:    addi a1, a1, %lo(.LCPI10_0)
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT:    vle16.v v12, (a1)
+; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v8, v0, v16
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
 ; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v0, v8, v16
-; V128-NEXT:    vwmaccu.vx v0, a0, v16
+; V128-NEXT:    addi a1, sp, 16
+; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT:    vwaddu.vv v0, v16, v8
+; V128-NEXT:    vwmaccu.vx v0, a0, v8
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    add sp, sp, a0
+; V128-NEXT:    addi sp, sp, 16
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v32f32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 9e21cc9e3d624a..6da83644413bc2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -403,26 +403,44 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-LABEL: interleave_v32i32:
 ; V128:       # %bb.0:
+; V128-NEXT:    addi sp, sp, -16
+; V128-NEXT:    .cfi_def_cfa_offset 16
+; V128-NEXT:    csrr a0, vlenb
+; V128-NEXT:    slli a0, a0, 3
+; V128-NEXT:    sub sp, sp, a0
+; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; V128-NEXT:    vmv8r.v v0, v16
+; V128-NEXT:    addi a0, sp, 16
+;
V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; V128-NEXT: vmv8r.v v16, v8 ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vslidedown.vi v8, v0, 16 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v24, v0, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI17_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v16, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v8, v0, v16 +; V128-NEXT: vwmaccu.vx v8, a0, v16 ; V128-NEXT: lui a1, 699051 ; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: li a2, 32 ; V128-NEXT: vmv.s.x v0, a1 -; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; V128-NEXT: vmerge.vvm v24, v8, v24, v0 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v0, v8, v16 -; V128-NEXT: vwmaccu.vx v0, a0, v16 +; V128-NEXT: addi a1, sp, 16 +; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; V128-NEXT: vwaddu.vv v0, v16, v8 +; V128-NEXT: vwmaccu.vx v0, a0, v8 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 3 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a26a87a1f3c139..23629ace4cf48c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -611,12 +611,10 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 224 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -625,12 +623,10 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 144 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -670,12 +666,11 @@ define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; 
CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -686,12 +681,10 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f889041647b235..e27ff0a573d5f2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,69 +159,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 58 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 25 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v16, v8, 4 +; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vid.v v20 -; RV32-NEXT: vadd.vi v4, v20, -10 -; RV32-NEXT: vmv.v.v v2, v20 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 4 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v1, a4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: 
addi a4, a4, 16 ; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t +; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 +; RV32-NEXT: li a5, 20 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 2 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill @@ -233,14 +221,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 49 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -248,12 +236,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -263,366 +252,330 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 20 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 20 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v2, -8 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v16, 2 ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: 
vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 +; RV32-NEXT: vslideup.vi v8, v16, 8, v0.t +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -6 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vslideup.vi v12, v16, 6, v0.t +; 
RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 960 +; RV32-NEXT: vmv.s.x v2, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 48 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI6_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -4 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v24, v8 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: 
vslideup.vi v28, v8, 4, v0.t +; RV32-NEXT: vmv.v.v v4, v28 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: lui a1, 15 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v5, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v28, v24, 6 +; RV32-NEXT: vmv1r.v v0, v5 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v20, v16, 6 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v8, v2, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) -; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vle16.v v16, (a3) +; RV32-NEXT: vle16.v v0, (a1) +; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v28, a1 +; RV32-NEXT: vmv.s.x v4, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; 
RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 -; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v0 +; RV32-NEXT: vmv1r.v v0, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v28, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vmv1r.v v0, v5 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_14) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a2, %hi(.LCPI6_15) ; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 41 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, 
v4, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v16, (a1) +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 +; RV32-NEXT: li a2, 20 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 58 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f53b51e05c5726..c8c08ab1a987b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -152,15 +152,12 @@ define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_slide_two_source: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vadd.vi v14, v12, -1 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vrgather.vi v12, v8, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t +; CHECK-NEXT: vslideup.vi v12, v10, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index a34fa9502d93b3..038fead011d899 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -7,12 +7,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; 
CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -21,12 +19,11 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -35,15 +32,12 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -58,9 +52,8 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -69,11 +62,10 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -82,11 +74,10 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -95,12 +86,10 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli 
zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -109,12 +98,11 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -133,11 +121,10 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: trn2.v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 @@ -146,11 +133,10 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -159,11 +145,10 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -182,11 +167,10 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: trn2.v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 @@ -205,11 +189,10 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: trn2.v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, 
mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 @@ -218,11 +201,10 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -231,11 +213,10 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -254,11 +235,10 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: trn2.v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 @@ -267,11 +247,10 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -280,11 +259,10 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -293,12 +271,10 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; 
CHECK-LABEL: trn1.v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -307,12 +283,11 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index ff6a9dc38999c2..25af5ba2f079f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -179,12 +179,11 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; ZVE32F-NEXT: vmv.v.i v0, 2 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vrgather.vi v9, v8, 1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2, v0.t +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i32>, ptr %in, align 4 @@ -234,12 +233,11 @@ define void @vnsrl_32_float(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; ZVE32F-NEXT: vmv.v.i v0, 2 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vrgather.vi v9, v8, 1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2, v0.t +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x float>, ptr %in, align 4 @@ -278,12 +276,11 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 ; V-NEXT: vmv.v.i v0, 2 -; V-NEXT: vrgather.vi v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vrgather.vi v9, v8, 1 +; V-NEXT: vslidedown.vi v9, v8, 2, v0.t +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_i64: @@ -330,12 +327,11 @@ define void @vnsrl_64_double(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 ; V-NEXT: vmv.v.i v0, 2 -; 
V-NEXT: vrgather.vi v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vrgather.vi v9, v8, 1 +; V-NEXT: vslidedown.vi v9, v8, 2, v0.t +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_double: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 84e9e2801ff6db..f3c70ed78c747f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -82,9 +82,9 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) @@ -169,9 +169,9 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
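As a closing sanity check on the strategy exercised by all of the tests
above, here is a standalone C++ model (illustrative only, not LLVM code)
verifying that lowering each operand as a single-source permute and then
selecting lane-wise reproduces the reference two-source shuffle semantics:

  #include <cassert>
  #include <vector>

  // Reference: lane i of a two-source shuffle reads A[M] for M < N, else
  // B[M - N]; undef (-1) lanes may hold anything (modeled here as 0).
  static std::vector<int> refShuffle(const std::vector<int> &A,
                                     const std::vector<int> &B,
                                     const std::vector<int> &Mask) {
    int N = (int)A.size();
    std::vector<int> Out;
    for (int M : Mask)
      Out.push_back(M < 0 ? 0 : M < N ? A[M] : B[M - N]);
    return Out;
  }

  // Model of the new lowering: two single-source permutes plus a vselect.
  static std::vector<int> loweredShuffle(const std::vector<int> &A,
                                         const std::vector<int> &B,
                                         const std::vector<int> &Mask) {
    int N = (int)A.size();
    std::vector<int> Out;
    for (int M : Mask) {
      bool FromLHS = M < N;  // undef lanes are routed to the LHS permute
      int LHSLane = FromLHS && M >= 0 ? A[M] : 0;  // single-source permute of A
      int RHSLane = FromLHS ? 0 : B[M - N];        // single-source permute of B
      Out.push_back(FromLHS ? LHSLane : RHSLane);  // the vselect / vmerge
    }
    return Out;
  }

  int main() {
    std::vector<int> A = {10, 11, 12, 13}, B = {20, 21, 22, 23};
    for (const auto &Mask : std::vector<std::vector<int>>{
             {0, 4, 1, 5},      // interleave, as in the tests above
             {1, 5, 3, 7},      // trn2-style transpose
             {0, 1, -1, 6}}) {  // mix with an undef lane
      assert(loweredShuffle(A, B, Mask) == refShuffle(A, B, Mask));
    }
    return 0;
  }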