Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Added SVE Prefetch* APIs. #103094

Merged
merged 20 commits into from
Jun 12, 2024
8 changes: 8 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,14 @@ void HWIntrinsicInfo::lookupImmBounds(
}
break;

case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
immLowerBound = (int)SVE_PRFOP_PLDL1KEEP;
immUpperBound = (int)SVE_PRFOP_CONST15;
break;

default:
unreached();
}
Expand Down
27 changes: 27 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1489,6 +1489,33 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
{
assert(hasImmediateOperand);
assert(HWIntrinsicInfo::HasEnumOperand(intrin.id));
if (intrin.op3->IsCnsIntOrI())
{
GetEmitter()->emitIns_PRFOP_R_R_I(ins, emitSize,
(insSvePrfop)intrin.op3->AsIntConCommon()->IconValue(), op1Reg,
op2Reg, 0);
}
else
{
assert(!intrin.op3->isContainedIntOrIImmed());

HWIntrinsicImmOpHelper helper(this, intrin.op3, node);
for (helper.EmitBegin(); !helper.Done(); helper.EmitCaseEnd())
{
const insSvePrfop prfop = (insSvePrfop)helper.ImmValue();
GetEmitter()->emitIns_PRFOP_R_R_I(ins, emitSize, prfop, op1Reg, op2Reg, 0);
}
}
break;
}

case NI_Vector64_ToVector128:
GetEmitter()->emitIns_Mov(ins, emitSize, targetReg, op1Reg, /* canSkip */ false);
break;
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ HARDWARE_INTRINSIC(Sve, Negate,
HARDWARE_INTRINSIC(Sve, Or, -1, -1, false, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, false, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, false, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PrefetchBytes, -1, 3, false, {INS_invalid, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt16, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_prfh, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt32, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_prfw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_prfd, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, true, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, ReverseElement16, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ReverseElement32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revw, INS_sve_revw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3194,6 +3194,10 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x2:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x3:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op3));
if (intrin.op3->IsCnsIntOrI())
Expand Down
51 changes: 33 additions & 18 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x2:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x3:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
break;

Expand Down Expand Up @@ -1965,28 +1969,39 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
(argNum == lowVectorOperandNum) ? lowVectorCandidates : RBM_NONE);
}
}
else if (intrin.id == NI_Sve_StoreAndZip)
{
srcCount += BuildAddrUses(intrin.op2);
}
else
{
SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;
switch (intrin.id)
TIHan marked this conversation as resolved.
Show resolved Hide resolved
{
case NI_Sve_StoreAndZip:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
srcCount += BuildAddrUses(intrin.op2);
break;

if (intrin.op2->gtType == TYP_MASK)
{
assert(lowVectorOperandNum != 2);
candidates = RBM_ALLMASK;
}
default:
{
SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;

if (forceOp2DelayFree)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
}
else
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
: BuildOperandUses(intrin.op2, candidates);
if (intrin.op2->gtType == TYP_MASK)
{
assert(lowVectorOperandNum != 2);
candidates = RBM_ALLMASK;
}

if (forceOp2DelayFree)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
}
else
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
: BuildOperandUses(intrin.op2, candidates);
}
}
break;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,20 @@ public enum SveMaskPattern : byte
/// </summary>
All = 31 // All available (implicitly a multiple of two).
}

public enum SvePrefetchType : byte
{
SV_PLDL1KEEP = 0, // Temporal fetch the addressed location for reading, to L1 cache.
SV_PLDL1STRM = 1, // Streaming fetch the addressed location for reading, to L1 cache.
SV_PLDL2KEEP = 2, // Temporal fetch the addressed location for reading, to L2 cache.
SV_PLDL2STRM = 3, // Streaming fetch the addressed location for reading, to L2 cache.
SV_PLDL3KEEP = 4, // Temporal fetch the addressed location for reading, to L3 cache.
SV_PLDL3STRM = 5, // Streaming fetch the addressed location for reading, to L3 cache.
SV_PSTL1KEEP = 8, // Temporal fetch the addressed location for writing, to L1 cache.
SV_PSTL1STRM = 9, // Streaming fetch the addressed location for writing, to L1 cache.
SV_PSTL2KEEP = 10, // Temporal fetch the addressed location for writing, to L2 cache.
SV_PSTL2STRM = 11, // Streaming fetch the addressed location for writing, to L2 cache.
SV_PSTL3KEEP = 12, // Temporal fetch the addressed location for writing, to L3 cache.
SV_PSTL3STRM = 13 // Streaming fetch the addressed location for writing, to L3 cache.
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -3132,6 +3132,30 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> PopCount(Vector<ulong> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfb(svbool_t pg, const void *base, enum svprfop op)
/// PRFB op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchBytes(Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfh(svbool_t pg, const void *base, enum svprfop op)
/// PRFH op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt16(Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfw(svbool_t pg, const void *base, enum svprfop op)
/// PRFW op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt32(Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfd(svbool_t pg, const void *base, enum svprfop op)
/// PRFD op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt64(Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }


/// Reverse all elements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3188,6 +3188,29 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> PopCount(Vector<ulong> value) => PopCount(value);

/// <summary>
/// void svprfb(svbool_t pg, const void *base, enum svprfop op)
/// PRFB op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchBytes(Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchBytes(mask, address, prefetchType);

/// <summary>
/// void svprfh(svbool_t pg, const void *base, enum svprfop op)
/// PRFH op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt16(Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt16(mask, address, prefetchType);

/// <summary>
/// void svprfw(svbool_t pg, const void *base, enum svprfop op)
/// PRFW op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt32(Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt32(mask, address, prefetchType);

/// <summary>
/// void svprfd(svbool_t pg, const void *base, enum svprfop op)
/// PRFD op, Pg, [Xbase, #0, MUL VL]
/// </summary>
public static unsafe void PrefetchInt64(Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt64(mask, address, prefetchType);

/// Reverse all elements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4608,6 +4608,11 @@ internal Arm64() { }
public static System.Numerics.Vector<ulong> PopCount(System.Numerics.Vector<long> value) { throw null; }
public static System.Numerics.Vector<ulong> PopCount(System.Numerics.Vector<ulong> value) { throw null; }

public static unsafe void PrefetchBytes(System.Numerics.Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt16(System.Numerics.Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt32(System.Numerics.Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt64(System.Numerics.Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }

public static System.Numerics.Vector<byte> ReverseElement(System.Numerics.Vector<byte> value) { throw null; }
public static System.Numerics.Vector<double> ReverseElement(System.Numerics.Vector<double> value) { throw null; }
public static System.Numerics.Vector<short> ReverseElement(System.Numerics.Vector<short> value) { throw null; }
Expand Down Expand Up @@ -4940,6 +4945,22 @@ public enum SveMaskPattern : byte
LargestMultipleOf3 = 30, // The largest multiple of 3.
All = 31 // All available (implicitly a multiple of two).
};

public enum SvePrefetchType : byte
Copy link
Contributor Author

@TIHan TIHan Jun 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kunalspathak @tannergooding was this enum type approved? I ask because of the SV_PLDL1KEEP names.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see a note here about they not approved? @tannergooding can you confirm?

#94006 (comment)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#97831 approved the Prefetch* APIs.

The enum should have been approved as part of that, but wasn't looked at directly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The enum names are different and were approved in #94007 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TIHan - can you update the names accordingly?

{
SV_PLDL1KEEP = 0, // Temporal fetch the addressed location for reading, to L1 cache.
SV_PLDL1STRM = 1, // Streaming fetch the addressed location for reading, to L1 cache.
SV_PLDL2KEEP = 2, // Temporal fetch the addressed location for reading, to L2 cache.
SV_PLDL2STRM = 3, // Streaming fetch the addressed location for reading, to L2 cache.
SV_PLDL3KEEP = 4, // Temporal fetch the addressed location for reading, to L3 cache.
SV_PLDL3STRM = 5, // Streaming fetch the addressed location for reading, to L3 cache.
SV_PSTL1KEEP = 8, // Temporal fetch the addressed location for writing, to L1 cache.
SV_PSTL1STRM = 9, // Streaming fetch the addressed location for writing, to L1 cache.
SV_PSTL2KEEP = 10, // Temporal fetch the addressed location for writing, to L2 cache.
SV_PSTL2STRM = 11, // Streaming fetch the addressed location for writing, to L2 cache.
SV_PSTL3KEEP = 12, // Temporal fetch the addressed location for writing, to L3 cache.
SV_PSTL3STRM = 13 // Streaming fetch the addressed location for writing, to L3 cache.
};
}
namespace System.Runtime.Intrinsics.X86
{
Expand Down
Loading
Loading