[RISCV] Recurse on second operand of two operand shuffles #79197

Merged · 1 commit · Jan 24, 2024
79 changes: 17 additions & 62 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4975,12 +4975,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,

// As a backup, shuffles can be lowered via a vrgather instruction, possibly
// merged with a second vrgather.
SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;

// Keep a track of which non-undef indices are used by each LHS/RHS shuffle
// half.
DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;

SmallVector<int> ShuffleMaskLHS, ShuffleMaskRHS;
SmallVector<SDValue> MaskVals;

// Now construct the mask that will be used by the blended vrgather operation.
@@ -4989,28 +4984,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps;
MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
? DAG.getConstant(MaskIndex, DL, XLenVT)
: DAG.getUNDEF(XLenVT));
GatherIndicesRHS.push_back(
IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
: DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
if (IsLHSOrUndefIndex && MaskIndex >= 0)
++LHSIndexCounts[MaskIndex];
if (!IsLHSOrUndefIndex)
++RHSIndexCounts[MaskIndex - NumElts];
ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
? MaskIndex : -1);
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
}

if (SwapOps) {
std::swap(V1, V2);
std::swap(GatherIndicesLHS, GatherIndicesRHS);
std::swap(ShuffleMaskLHS, ShuffleMaskRHS);
}

assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);

unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
MVT IndexVT = VT.changeTypeToInteger();
// Since we can't introduce illegal index types at this stage, use i16 and
@@ -5038,6 +5025,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// are handled above.
if (V2.isUndef()) {
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
SmallVector<SDValue> GatherIndicesLHS;
for (int ShuffleIdx : ShuffleMaskLHS)
GatherIndicesLHS.push_back(ShuffleIdx != -1
? DAG.getConstant(ShuffleIdx, DL, XLenVT)
: DAG.getUNDEF(XLenVT));
SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
Subtarget);
Expand All @@ -5046,50 +5038,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return convertFromScalableVector(VT, Gather, DAG, Subtarget);
}

// Translate the gather index we computed above (and possibly swapped)
// back to a shuffle mask. This step should disappear once we complete
// the migration to recursive design.
SmallVector<int> ShuffleMaskLHS;
ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
for (SDValue GatherIndex : GatherIndicesLHS) {
if (GatherIndex.isUndef()) {
ShuffleMaskLHS.push_back(-1);
continue;
}
auto *IdxC = cast<ConstantSDNode>(GatherIndex);
ShuffleMaskLHS.push_back(IdxC->getZExtValue());
}

// Recursively invoke lowering for the LHS as if there were no RHS.
// This allows us to leverage all of our single source permute tricks.
SDValue Gather =
DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);

// Blend in second vector source with an additional vrgather.
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);

MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
SelectMask =
convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);

// If only one index is used, we can use a "splat" vrgather.
// TODO: We can splat the most-common index and fix-up any stragglers, if
// that's beneficial.
if (RHSIndexCounts.size() == 1) {
int SplatIndex = RHSIndexCounts.begin()->getFirst();
Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
SelectMask, VL);
} else {
SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
RHSIndices =
convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
SelectMask, VL);
}

return convertFromScalableVector(VT, Gather, DAG, Subtarget);
// Recursively invoke lowering for each operand if we had two
// independent single source permutes, and then combine the result via a
// vselect. Note that the vselect will likely be folded back into the
// second permute (vrgather, or other) by the post-isel combine.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), ShuffleMaskRHS);
return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V2, V1);
@lukel97 (Contributor) commented on Jan 24, 2024:

As an aside, do you have a plan for handling cases where one shuffle is an identity op? For example:

%shuffle = shufflevector <4 x i64> %0, <4 x i64> %1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>

After this patch we get

	vsetivli	zero, 4, e64, m1, ta, ma
	vmv.v.i	v0, 12
	vslidedown.vi	v8, v8, 2
	vmerge.vvm	v8, v8, v9, v0

But we can do this in one slidedown:

	vsetivli	zero, 2, e64, m1, ta, ma
	vslidedown.vi	v9, v8, 2

Author (Collaborator) replied:

Part of the point of this patch series was to better handle identity permutes. :)

Your example looks less like a case where we fail to realize one side is an identity permute, and more like a case where we fail to realize that the vmerge can be folded back into the LHS in this case.

One simple thing we could do: extend the swap logic to prefer putting the identity on the LHS. That feels more like hiding the real issue than a deep fix, but it wouldn't be hard to do.

Can you file a bug for this? I don't plan to look at it immediately, but it'd be good not to forget it.
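For illustration, a minimal sketch of the mask test such a swap preference could rely on; the helper name and the plain std::vector interface are assumptions for this sketch, not code from the patch:

#include <cstddef>
#include <vector>

// Hedged sketch (not from the patch): decide whether a single-source shuffle
// mask is an identity permute, i.e. every non-undef lane stays in place.
// -1 marks an undef lane, matching LLVM's shuffle mask convention.
static bool isIdentityLikeMask(const std::vector<int> &Mask) {
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != -1 && Mask[I] != static_cast<int>(I))
      return false;
  return true;
}

The idea would then boil down to: if the RHS half-mask is identity-like and the LHS half-mask is not, flip SwapOps before the halves are swapped, so the identity side lands on the LHS and the blend has a better chance of folding into the remaining gather.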

@lukel97 (Contributor) replied:

> Your example looks less like a case where we fail to realize one side is an identity permute, and more like a case where we fail to realize that the vmerge can be folded back into the LHS in this case.

That was my thinking as well. We could do this as a generic MI transform on PseudoVMERGE in RISCVFoldMasks.cpp when the mask is emulating a VL tail truncation, e.g.:

%t = VFOO ...
%f = VBAR pt=undef, ...
%v = VMERGE undef, %f, %t, mask=0b1100, vl=4
// -->
%t = VFOO ...
%v = VBAR %t, ..., vl=...2

i.e. a mask where the MSBs emulate taking the tail from the %true operand.

I don't think we'll generate many vmerges with this type of mask outside of this shuffle lowering, but this is one way of doing it without SelectionDAG.
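To make the mask condition concrete, here is a rough standalone sketch of the check such a transform would need; the helper name and the plain integer interface are illustrative assumptions, not the actual MI-level API in RISCVFoldMasks.cpp:

#include <cstdint>

// Hedged sketch: given a vmerge mask constant and the operation's VL, check
// whether the mask has the form 1...10...0 within the low VL lanes, i.e. it
// only emulates a VL tail truncation. If so, return the number of low lanes
// taken from the false operand (the VL the false operand could be shortened
// to, with the true operand becoming its passthru); return -1 otherwise.
static int maskIsTailTruncation(uint64_t Mask, unsigned VL) {
  uint64_t Low = VL < 64 ? (Mask & ((uint64_t{1} << VL) - 1)) : Mask;
  unsigned TrailingZeros = 0;
  while (TrailingZeros < VL && !((Low >> TrailingZeros) & 1))
    ++TrailingZeros;
  for (unsigned I = TrailingZeros; I < VL; ++I)
    if (!((Low >> I) & 1))
      return -1; // a hole in the mask: not a pure tail-truncation pattern
  return static_cast<int>(TrailingZeros);
}

With the mask=0b1100, vl=4 case from the example above, this returns 2, matching the shortened vl on the folded VBAR.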

}

bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
36 changes: 27 additions & 9 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,26 +238,44 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-LABEL: interleave_v32f32:
; V128: # %bb.0:
; V128-NEXT: addi sp, sp, -16
Author (Collaborator) commented:
This was the diff I was originally concerned by, but thinking about it further, I think it's fine. All of the increases in register pressure happen in cases where we're legalizing a shuffle wider than m8, and such shuffles aren't going to be very common in auto-vectorized code.

; V128-NEXT: .cfi_def_cfa_offset 16
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; V128-NEXT: vmv8r.v v0, v16
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; V128-NEXT: vmv8r.v v16, v8
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: vslidedown.vi v8, v0, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
; V128-NEXT: lui a1, %hi(.LCPI10_0)
; V128-NEXT: addi a1, a1, %lo(.LCPI10_0)
; V128-NEXT: li a2, 32
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; V128-NEXT: vle16.v v12, (a1)
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v16, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v8, v0, v16
; V128-NEXT: vwmaccu.vx v8, a0, v16
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: li a2, 32
; V128-NEXT: vmv.s.x v0, a1
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; V128-NEXT: vmerge.vvm v24, v8, v24, v0
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: addi a1, sp, 16
; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; V128-NEXT: vwaddu.vv v0, v16, v8
; V128-NEXT: vwmaccu.vx v0, a0, v8
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: add sp, sp, a0
; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32f32:
36 changes: 27 additions & 9 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -403,26 +403,44 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128: # %bb.0:
; V128-NEXT: addi sp, sp, -16
; V128-NEXT: .cfi_def_cfa_offset 16
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; V128-NEXT: vmv8r.v v0, v16
; V128-NEXT: addi a0, sp, 16
; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; V128-NEXT: vmv8r.v v16, v8
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: vslidedown.vi v8, v0, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v24, v0, v8
; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v24, a0, v8
; V128-NEXT: lui a1, %hi(.LCPI17_0)
; V128-NEXT: addi a1, a1, %lo(.LCPI17_0)
; V128-NEXT: li a2, 32
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
; V128-NEXT: vle16.v v12, (a1)
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT: vslidedown.vi v0, v16, 16
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v8, v0, v16
; V128-NEXT: vwmaccu.vx v8, a0, v16
; V128-NEXT: lui a1, 699051
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: li a2, 32
; V128-NEXT: vmv.s.x v0, a1
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; V128-NEXT: vmerge.vvm v24, v8, v24, v0
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: addi a1, sp, 16
; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; V128-NEXT: vwaddu.vv v0, v16, v8
; V128-NEXT: vwmaccu.vx v0, a0, v8
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: add sp, sp, a0
; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i32:
23 changes: 8 additions & 15 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -611,12 +611,10 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 224
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v10, v10, -4
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
ret <8 x i8> %res
@@ -625,12 +623,10 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_end_non_contiguous:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 144
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v10, v10, -4
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
ret <8 x i8> %res
@@ -670,12 +666,11 @@ define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: li a0, 195
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
ret <8 x i8> %res
@@ -686,12 +681,10 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vadd.vi v10, v10, -1
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vslidedown.vi v8, v8, 2
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
ret <8 x i8> %res