Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avx512 single kmask #82255

Closed
wants to merge 12 commits into from
2 changes: 2 additions & 0 deletions src/coreclr/jit/assertionprop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3221,6 +3221,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree)
}

case TYP_SIMD32:
case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
{
simd32_t value = vnStore->ConstantValue<simd32_t>(vnCns);

Expand All @@ -3231,6 +3232,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree)
break;
}
break;

#endif // FEATURE_SIMD

case TYP_BYREF:
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
void genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node);
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node);
void genAESIntrinsic(GenTreeHWIntrinsic* node);
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node);
void genFMAIntrinsic(GenTreeHWIntrinsic* node);
Expand Down
36 changes: 32 additions & 4 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,22 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre

if (vecCon->IsZero())
{
if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
bool isSupported;

if (attr == EA_32BYTE)
{
isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX);
}
else if (attr == EA_64BYTE)
{
isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F);
}
else
{
assert((attr == EA_8BYTE) || (attr == EA_16BYTE));
isSupported = true;
}
if (isSupported)
{
#if defined(FEATURE_SIMD)
emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
Expand Down Expand Up @@ -551,6 +566,18 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
break;
}

case TYP_SIMD64:
{
simd64_t constValue;
// TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
constValue.v256[0] = vecCon->gtSimd32Val;
constValue.v256[1] = vecCon->gtSimd32Val;
CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue);

emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
break;
}
#endif // FEATURE_SIMD

default:
Expand Down Expand Up @@ -5778,9 +5805,10 @@ void CodeGen::genCall(GenTreeCall* call)
// To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
// VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
// when there's preceding 256-bit AVX to legacy SSE transition penalty.
if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && GetEmitter()->Contains256bitAVX())
// This applies to 512bit AVX512 instructions as well.
if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
{
assert(compiler->canUseVexEncoding());
assert(GetEmitter()->Contains256bitOrMoreAVX() && compiler->canUseVexEncoding());
instGen(INS_vzeroupper);
}

Expand Down Expand Up @@ -11064,7 +11092,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
bool emitVzeroUpper = false;
if (check256bitOnly)
{
emitVzeroUpper = GetEmitter()->Contains256bitAVX();
emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
}
else
{
Expand Down
22 changes: 20 additions & 2 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2280,6 +2280,24 @@ void Compiler::compSetProcessor()
{
instructionSetFlags.AddInstructionSet(InstructionSet_Vector256);
}
if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F) &&
instructionSetFlags.HasInstructionSet(InstructionSet_AVX512BW) &&
instructionSetFlags.HasInstructionSet(InstructionSet_AVX512CD) &&
instructionSetFlags.HasInstructionSet(InstructionSet_AVX512DQ))
{
if (!DoJitStressEvexEncoding())
{
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F);
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW);
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD);
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ);
instructionSetFlags = EnsureInstructionSetFlagsAreValid(instructionSetFlags);
}
else
{
instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
}
}
#elif defined(TARGET_ARM64)
if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd))
{
Expand All @@ -2297,14 +2315,14 @@ void Compiler::compSetProcessor()
if (canUseEvexEncoding())
{
codeGen->GetEmitter()->SetUseEvexEncoding(true);
// TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added.
// TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added.
}
if (canUseVexEncoding())
{
codeGen->GetEmitter()->SetUseVEXEncoding(true);
// Assume each JITted method does not contain AVX instruction at first
codeGen->GetEmitter()->SetContainsAVX(false);
codeGen->GetEmitter()->SetContains256bitAVX(false);
codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
}
}
#endif // TARGET_XARCH
Expand Down
45 changes: 40 additions & 5 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -7621,7 +7621,7 @@ class Compiler
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
assert(type != TYP_STRUCT);
return (type == TYP_SIMD32);
return (type == TYP_SIMD32) || (type == TYP_SIMD64);
}
#elif defined(TARGET_ARM64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
Expand Down Expand Up @@ -8318,6 +8318,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
SIMDLevel getSIMDSupportLevel()
{
#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
{
return SIMD_AVX512F_Supported;
}

if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
return SIMD_AVX2_Supported;
Expand Down Expand Up @@ -8435,12 +8440,26 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
CORINFO_CLASS_HANDLE Vector256ULongHandle;
CORINFO_CLASS_HANDLE Vector256NIntHandle;
CORINFO_CLASS_HANDLE Vector256NUIntHandle;

CORINFO_CLASS_HANDLE Vector512FloatHandle;
CORINFO_CLASS_HANDLE Vector512DoubleHandle;
CORINFO_CLASS_HANDLE Vector512IntHandle;
CORINFO_CLASS_HANDLE Vector512UShortHandle;
CORINFO_CLASS_HANDLE Vector512UByteHandle;
CORINFO_CLASS_HANDLE Vector512ShortHandle;
CORINFO_CLASS_HANDLE Vector512ByteHandle;
CORINFO_CLASS_HANDLE Vector512LongHandle;
CORINFO_CLASS_HANDLE Vector512UIntHandle;
CORINFO_CLASS_HANDLE Vector512ULongHandle;
CORINFO_CLASS_HANDLE Vector512NIntHandle;
CORINFO_CLASS_HANDLE Vector512NUIntHandle;
#endif // defined(TARGET_XARCH)
#endif // FEATURE_HW_INTRINSICS

CORINFO_CLASS_HANDLE CanonicalSimd8Handle;
CORINFO_CLASS_HANDLE CanonicalSimd16Handle;
CORINFO_CLASS_HANDLE CanonicalSimd32Handle;
CORINFO_CLASS_HANDLE CanonicalSimd64Handle;

SIMDHandlesCache()
{
Expand Down Expand Up @@ -8506,6 +8525,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
}

case TYP_SIMD32:
case TYP_SIMD64:
break;

default:
Expand Down Expand Up @@ -8611,6 +8631,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
return m_simdHandleCache->CanonicalSimd16Handle;
case TYP_SIMD32:
return m_simdHandleCache->CanonicalSimd32Handle;
case TYP_SIMD64:
return m_simdHandleCache->CanonicalSimd64Handle;
default:
unreached();
}
Expand Down Expand Up @@ -8745,7 +8767,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
var_types getSIMDVectorType()
{
#if defined(TARGET_XARCH)
if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
if (getSIMDSupportLevel() == SIMD_AVX512F_Supported)
{
return TYP_SIMD64;
}
else if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
{
return TYP_SIMD32;
}
Expand Down Expand Up @@ -8786,7 +8812,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
unsigned getSIMDVectorRegisterByteLength()
{
#if defined(TARGET_XARCH)
if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
// TODO-XArch-AVX512 : Return ZMM_REGSIZE_BYTES once Vector<T> supports AVX512.
if (getSIMDSupportLevel() >= SIMD_AVX2_Supported)
{
return YMM_REGSIZE_BYTES;
}
Expand Down Expand Up @@ -8815,7 +8842,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
unsigned int maxSIMDStructBytes()
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX))
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
{
return ZMM_REGSIZE_BYTES;
}
else if (compOpportunisticallyDependsOn(InstructionSet_AVX))
{
return YMM_REGSIZE_BYTES;
}
Expand Down Expand Up @@ -8857,6 +8888,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{
simdType = TYP_SIMD32;
}
else if (size == 64)
{
simdType = TYP_SIMD64;
}
else
{
noway_assert(!"Unexpected size for SIMD type");
Expand Down Expand Up @@ -8892,7 +8927,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// otherwise cause the highest level of instruction set support to be reported to crossgen2.
// and this api is only ever used as an optimization or assert, so no reporting should
// ever happen.
return YMM_REGSIZE_BYTES;
return ZMM_REGSIZE_BYTES;
}
#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
unsigned vectorRegSize = maxSIMDStructBytes();
Expand Down
40 changes: 32 additions & 8 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2612,15 +2612,11 @@ void emitter::emitSetFrameRangeArgs(int offsLo, int offsHi)
*/

const emitter::opSize emitter::emitSizeEncode[] = {
emitter::OPSZ1, emitter::OPSZ2, OPSIZE_INVALID, emitter::OPSZ4, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
emitter::OPSZ8, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
OPSIZE_INVALID, emitter::OPSZ16, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, emitter::OPSZ32,
emitter::OPSZ1, emitter::OPSZ2, emitter::OPSZ4, emitter::OPSZ8, emitter::OPSZ16, emitter::OPSZ32, emitter::OPSZ64,
};

const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE,
EA_8BYTE, EA_16BYTE, EA_32BYTE};
const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, EA_8BYTE,
EA_16BYTE, EA_32BYTE, EA_64BYTE};

/*****************************************************************************
*
Expand Down Expand Up @@ -6548,7 +6544,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,

coldCodeBlock = nullptr;

// This restricts the data alignment to: 4, 8, 16, or 32 bytes
// This restricts the data alignment to: 4, 8, 16, 32 or 64 bytes
// Alignments greater than 64 would require VM support in ICorJitInfo::allocMem
uint32_t dataAlignment = emitConsDsc.alignment;
assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) &&
Expand Down Expand Up @@ -6629,6 +6625,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
{
allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN;
}
else if (dataAlignment == 64)
{
allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN;
}

CorJitAllocMemFlag allocMemFlag = static_cast<CorJitAllocMemFlag>(allocMemFlagCodeAlign | allocMemFlagDataAlign);

Expand Down Expand Up @@ -7976,6 +7976,30 @@ CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue)
#endif // !FEATURE_SIMD
}

CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue)
{
    // Inline constant data is exposed through a pseudo static-field handle
    // (created by eeFindJitDataOffs). The emitter treats such a handle as a
    // reference into the constant data section rather than a real static field.
    CLANG_FORMAT_COMMENT_ANCHOR;

#if defined(FEATURE_SIMD)
    // The constant is 64 bytes wide and, by default, aligned to its own size.
    const unsigned dataSize  = 64;
    unsigned       dataAlign = dataSize;

#ifdef TARGET_XARCH
    // When compiling for small code, fall back to the minimum data alignment.
    if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
    {
        dataAlign = dataSection::MIN_DATA_ALIGN;
    }
#endif // TARGET_XARCH

    // Record the constant in the data section and hand back the handle that refers to it.
    const UNATIVE_OFFSET dataOffs = emitDataConst(&constValue, dataSize, dataAlign, TYP_SIMD64);
    return emitComp->eeFindJitDataOffs(dataOffs);
#else
    unreached();
#endif // !FEATURE_SIMD
}

/*****************************************************************************
*
* Output the given data section at the specified address.
Expand Down
12 changes: 7 additions & 5 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,8 @@ class emitter
OPSZ8 = 3,
OPSZ16 = 4,
OPSZ32 = 5,
OPSZ_COUNT = 6,
OPSZ64 = 6,
OPSZ_COUNT = 7,
#ifdef TARGET_AMD64
OPSZP = OPSZ8,
#else
Expand Down Expand Up @@ -2061,6 +2062,7 @@ class emitter
CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue);
CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue);
CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue);
regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src);
regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2);
void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem);
Expand Down Expand Up @@ -2674,11 +2676,11 @@ class emitter

struct dataSection
{
// Note to use alignments greater than 32 requires modification in the VM
// Note to use alignments greater than 64 requires modification in the VM
// to support larger alignments (see ICorJitInfo::allocMem)
//
const static unsigned MIN_DATA_ALIGN = 4;
const static unsigned MAX_DATA_ALIGN = 32;
const static unsigned MAX_DATA_ALIGN = 64;

enum sectionType
{
Expand Down Expand Up @@ -2989,9 +2991,9 @@ inline emitAttr emitActualTypeSize(T type)
/* static */ inline emitter::opSize emitter::emitEncodeSize(emitAttr size)
{
assert(size == EA_1BYTE || size == EA_2BYTE || size == EA_4BYTE || size == EA_8BYTE || size == EA_16BYTE ||
size == EA_32BYTE);
size == EA_32BYTE || size == EA_64BYTE);

return emitSizeEncode[((int)size) - 1];
return emitSizeEncode[genLog2(size)];
}

/* static */ inline emitAttr emitter::emitDecodeSize(emitter::opSize ensz)
Expand Down
Loading