From f81815fe67f01aee7e1a08cd735ec2438939096e Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 17 Jul 2023 16:34:52 -0700 Subject: [PATCH 01/16] Allow multiple kmask registers to be allocated and cleanup some codegen around them --- src/coreclr/jit/codegencommon.cpp | 9 +- src/coreclr/jit/codegeninterface.h | 20 +- src/coreclr/jit/compiler.cpp | 15 +- src/coreclr/jit/compiler.h | 37 +++ src/coreclr/jit/emit.cpp | 4 + src/coreclr/jit/emit.h | 9 + src/coreclr/jit/emitxarch.cpp | 6 +- src/coreclr/jit/gentree.cpp | 4 + src/coreclr/jit/hwintrinsic.h | 14 +- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 259 ++++++++++++++++---- src/coreclr/jit/hwintrinsiclistxarch.h | 89 +++---- src/coreclr/jit/hwintrinsicxarch.cpp | 3 +- src/coreclr/jit/lowerxarch.cpp | 234 +++++++++++++----- src/coreclr/jit/lsra.cpp | 6 +- src/coreclr/jit/lsra.h | 54 +++- src/coreclr/jit/lsrabuild.cpp | 31 +++ src/coreclr/jit/lsraxarch.cpp | 10 - src/coreclr/jit/targetamd64.h | 22 +- src/coreclr/jit/targetx86.h | 26 +- 19 files changed, 653 insertions(+), 199 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 7509ddc74f2f9..12c09cd43f278 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -68,14 +68,19 @@ CodeGenInterface::CodeGenInterface(Compiler* theCompiler) { } -#if defined(TARGET_AMD64) +#if defined(TARGET_XARCH) void CodeGenInterface::CopyRegisterInfo() { +#if defined(TARGET_AMD64) rbmAllFloat = compiler->rbmAllFloat; rbmFltCalleeTrash = compiler->rbmFltCalleeTrash; -} #endif // TARGET_AMD64 + rbmAllMask = compiler->rbmAllMask; + rbmMskCalleeTrash = compiler->rbmMskCalleeTrash; +} +#endif // TARGET_XARCH + /*****************************************************************************/ CodeGen::CodeGen(Compiler* theCompiler) : CodeGenInterface(theCompiler) diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index c2bcedb8ea9b7..4cfd462567d5b 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -63,9 +63,6 @@ class CodeGenInterface regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; - // Call this function after the equivalent fields in Compiler have been initialized. - void CopyRegisterInfo(); - regMaskTP get_RBM_ALLFLOAT() const { return this->rbmAllFloat; @@ -76,6 +73,23 @@ class CodeGenInterface } #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) + regMaskTP rbmAllMask; + regMaskTP rbmMskCalleeTrash; + + // Call this function after the equivalent fields in Compiler have been initialized. + void CopyRegisterInfo(); + + regMaskTP get_RBM_ALLMASK() const + { + return this->rbmAllMask; + } + regMaskTP get_RBM_MSK_CALLEE_TRASH() const + { + return this->rbmMskCalleeTrash; + } +#endif // TARGET_XARCH + // genSpillVar is called by compUpdateLifeVar. // TODO-Cleanup: We should handle the spill directly in CodeGen, rather than // calling it from compUpdateLifeVar. Then this can be non-virtual. 
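Context for the mask-register plumbing above: masks that were compile-time RBM_* constants become per-compilation fields, seeded from an *_INIT value, widened when EVEX (AVX512) is available, and then mirrored into CodeGen via CopyRegisterInfo(). A minimal standalone sketch of that shape, with simplified types (regMaskTP, the RBM_* values, and the stub classes here are illustrative assumptions, not the JIT's real definitions):

    #include <cstdint>

    typedef uint64_t regMaskTP; // stand-in for the JIT's register-mask type

    // Assumed values for illustration: k0 is always modeled; k1-k7 only
    // become allocatable once EVEX encoding can actually be used.
    const regMaskTP RBM_ALLMASK_INIT = 0x01; // k0
    const regMaskTP RBM_ALLMASK_EVEX = 0xFE; // k1-k7

    struct CodeGenStub
    {
        regMaskTP rbmAllMask = 0;
    };

    struct CompilerStub
    {
        regMaskTP    rbmAllMask = 0;
        CodeGenStub* codeGen;

        explicit CompilerStub(CodeGenStub* cg) : codeGen(cg) {}

        void InitRegisterMasks(bool canUseEvexEncoding)
        {
            rbmAllMask = RBM_ALLMASK_INIT;

            if (canUseEvexEncoding)
            {
                rbmAllMask |= RBM_ALLMASK_EVEX;
            }

            // Mirror the final value into the code generator, as the patch
            // does in CodeGenInterface::CopyRegisterInfo().
            codeGen->rbmAllMask = rbmAllMask;
        }
    };

The same one-time copy is repeated for the emitter (emit.cpp/emit.h below), so every phase that asks "which kmask registers exist?" reads a single answer computed once in compInitOptions.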
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index fb7f2f73351af..9f5fba6f74734 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3375,9 +3375,22 @@ void Compiler::compInitOptions(JitFlags* jitFlags) rbmFltCalleeTrash |= RBM_HIGHFLOAT; cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT; } +#endif // TARGET_AMD64 + +#if defined(TARGET_XARCH) + rbmAllMask = RBM_ALLMASK_INIT; + rbmMskCalleeTrash = RBM_MSK_CALLEE_TRASH_INIT; + cntCalleeTrashMask = CNT_CALLEE_TRASH_MASK_INIT; + + if (canUseEvexEncoding()) + { + rbmAllMask |= RBM_ALLMASK_EVEX; + rbmMskCalleeTrash |= RBM_MSK_CALLEE_TRASH_EVEX; + cntCalleeTrashMask += CNT_CALLEE_TRASH_MASK; + } codeGen->CopyRegisterInfo(); -#endif // TARGET_AMD64 +#endif // TARGET_XARCH } #ifdef DEBUG diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 8a2d862d1fe2c..fbc64f6fb130a 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10910,6 +10910,43 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) +private: + // The following are for initializing register allocator "constants" defined in targetamd64.h + // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which adds + // 8 mask registers for use. + // + // Users of these values need to define four accessor functions: + // + // regMaskTP get_RBM_ALLMASK(); + // regMaskTP get_RBM_MSK_CALLEE_TRASH(); + // unsigned get_CNT_CALLEE_TRASH_MASK(); + // unsigned get_AVAILABLE_REG_COUNT(); + // + // which return the values of these variables. + // + // This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // TARGET_XARCH requires one. 
+    //
+    regMaskTP rbmAllMask;
+    regMaskTP rbmMskCalleeTrash;
+    unsigned  cntCalleeTrashMask;
+
+public:
+    regMaskTP get_RBM_ALLMASK() const
+    {
+        return this->rbmAllMask;
+    }
+    regMaskTP get_RBM_MSK_CALLEE_TRASH() const
+    {
+        return this->rbmMskCalleeTrash;
+    }
+    unsigned get_CNT_CALLEE_TRASH_MASK() const
+    {
+        return this->cntCalleeTrashMask;
+    }
+#endif // TARGET_XARCH
+
 }; // end of class Compiler
 
 //---------------------------------------------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 0c1318006e18c..b241d0ddaf425 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -747,6 +747,10 @@ void emitter::emitBegCG(Compiler* comp, COMP_HANDLE cmpHandle)
 #if defined(TARGET_AMD64)
     rbmFltCalleeTrash = emitComp->rbmFltCalleeTrash;
 #endif // TARGET_AMD64
+
+#if defined(TARGET_XARCH)
+    rbmMskCalleeTrash = emitComp->rbmMskCalleeTrash;
+#endif // TARGET_XARCH
 }
 
 void emitter::emitEndCG()
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 4ba00b1957762..1ec95b21c03a0 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -2311,6 +2311,15 @@ class emitter
     }
 #endif // TARGET_AMD64
 
+#if defined(TARGET_XARCH)
+    regMaskTP rbmMskCalleeTrash;
+
+    regMaskTP get_RBM_MSK_CALLEE_TRASH() const
+    {
+        return this->rbmMskCalleeTrash;
+    }
+#endif // TARGET_XARCH
+
     CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr);
 #if defined(FEATURE_SIMD)
     CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 3d6aa46842621..d517d2fc63f31 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -6977,7 +6977,7 @@ void emitter::emitIns_R_R_C(instruction ins,
 void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2)
 {
     assert(IsAvx512OrPriorInstruction(ins));
-    assert(IsThreeOperandAVXInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
 
     instrDesc* id = emitNewInstr(attr);
     id->idIns(ins);
@@ -11557,7 +11557,7 @@ void emitter::emitDispIns(
         case IF_RWR_RWR_RRD:
         {
             assert(IsVexOrEvexEncodableInstruction(ins));
-            assert(IsThreeOperandAVXInstruction(ins));
+            assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
 
             regNumber reg2 = id->idReg2();
             regNumber reg3 = id->idReg3();
 
             if (ins == INS_bextr || ins == INS_bzhi
@@ -14956,7 +14956,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id)
     instruction ins = id->idIns();
 
     assert(IsVexOrEvexEncodableInstruction(ins));
-    assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins));
+    assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins));
     regNumber targetReg = id->idReg1();
     regNumber src1      = id->idReg2();
     regNumber src2      = id->idReg3();
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 66943d25d631b..f10961caa2dd0 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -25213,7 +25213,11 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const
 //
 bool GenTreeHWIntrinsic::OperIsEmbBroadcastCompatible() const
 {
+#if defined(TARGET_XARCH)
     return HWIntrinsicInfo::IsEmbBroadcastCompatible(GetHWIntrinsicId());
+#else
+    return false;
+#endif // TARGET_XARCH
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index 01b886744ea4f..a610c6bf592e9 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -200,9 +200,13 @@ enum HWIntrinsicFlag : unsigned int
     // The intrinsic is a PermuteVar2x intrinsic
     HW_Flag_PermuteVar2x = 0x4000000,
-#endif // TARGET_XARCH
+
     // The intrinsic is an embedded broadcast compatible intrinsic
     HW_Flag_EmbBroadcastCompatible = 0x8000000,
+
+    // The intrinsic can consume or produce an AVX512 mask register
+    HW_Flag_WithAvx512Mask = 0x10000000,
+#endif // TARGET_XARCH
 };
 
 #if defined(TARGET_XARCH)
@@ -580,12 +584,20 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_Commutative) != 0;
     }
 
+#if defined(TARGET_XARCH)
     static bool IsEmbBroadcastCompatible(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);
         return (flags & HW_Flag_EmbBroadcastCompatible) != 0;
     }
 
+    static bool WithAvx512Mask(NamedIntrinsic id)
+    {
+        HWIntrinsicFlag flags = lookupFlags(id);
+        return (flags & HW_Flag_WithAvx512Mask) != 0;
+    }
+#endif // TARGET_XARCH
+
     static bool IsMaybeCommutative(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 77341f844ffa8..1792c3dd5c949 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1911,65 +1911,152 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
             break;
         }
 
-        case NI_AVX512F_MoveMaskSpecial:
+        case NI_AVX512F_AddMask:
         {
-            op1Reg            = op1->GetRegNum();
-            regNumber maskReg = node->ExtractTempReg(RBM_ALLMASK);
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count    = simdSize / genTypeSize(baseType);
+
+            if (count <= 8)
+            {
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_kaddb;
+            }
+            else if (count == 16)
+            {
+                ins = INS_kaddw;
+            }
+            else if (count == 32)
+            {
+                ins = INS_kaddd;
+            }
+            else
+            {
+                assert(count == 64);
+                ins = INS_kaddq;
+            }
+
+            op1Reg = op1->GetRegNum();
+
+            GenTree*  op2    = node->Op(2);
+            regNumber op2Reg = op2->GetRegNum();
 
-            instruction maskIns;
-            instruction kmovIns;
-            emitAttr    kmovAttr = EA_4BYTE;
+            assert(emitter::isMaskReg(targetReg));
+            assert(emitter::isMaskReg(op1Reg));
+            assert(emitter::isMaskReg(op2Reg));
+
+            // Use EA_32BYTE to ensure the VEX.L bit gets set
+            emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg);
+            break;
+        }
+
+        case NI_AVX512F_AndMask:
+        {
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count    = simdSize / genTypeSize(baseType);
 
-            // TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change
-            // if used for other vector lengths, i.e., TYPE_BYTE requires kmovq for for 512-bit vector, but kmovd
-            // for 256-bit vector.
- switch (baseType) + if (count <= 8) { - case TYP_BYTE: - case TYP_UBYTE: - { - maskIns = INS_vpmovb2m; - kmovIns = INS_kmovq_gpr; - kmovAttr = EA_8BYTE; - break; - } + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kandb; + } + else if (count == 16) + { + ins = INS_kandw; + } + else if (count == 32) + { + ins = INS_kandd; + } + else + { + assert(count == 64); + ins = INS_kandq; + } - case TYP_SHORT: - case TYP_USHORT: - { - maskIns = INS_vpmovw2m; - kmovIns = INS_kmovd_gpr; - break; - } + op1Reg = op1->GetRegNum(); - case TYP_INT: - case TYP_UINT: - case TYP_FLOAT: - { - maskIns = INS_vpmovd2m; - kmovIns = INS_kmovw_gpr; - break; - } + GenTree* op2 = node->Op(2); + regNumber op2Reg = op2->GetRegNum(); - case TYP_DOUBLE: - case TYP_LONG: - case TYP_ULONG: - { - maskIns = INS_vpmovq2m; - kmovIns = INS_kmovb_gpr; - break; - } + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + assert(emitter::isMaskReg(op2Reg)); - default: - { - unreached(); - } + // Use EA_32BYTE to ensure the VEX.L bit gets set + emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg); + break; + } + + case NI_AVX512F_AndNotMask: + { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kandnb; + } + else if (count == 16) + { + ins = INS_kandnw; + } + else if (count == 32) + { + ins = INS_kandnd; + } + else + { + assert(count == 64); + ins = INS_kandnq; + } + + op1Reg = op1->GetRegNum(); + + GenTree* op2 = node->Op(2); + regNumber op2Reg = op2->GetRegNum(); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + assert(emitter::isMaskReg(op2Reg)); + + // Use EA_32BYTE to ensure the VEX.L bit gets set + emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg); + break; + } + + case NI_AVX512F_MoveMask: + { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kmovb_gpr; + attr = EA_4BYTE; + } + else if (count == 16) + { + ins = INS_kmovw_gpr; + attr = EA_4BYTE; + } + else if (count == 32) + { + ins = INS_kmovd_gpr; + attr = EA_4BYTE; + } + else + { + assert(count == 64); + ins = INS_kmovq_gpr; + attr = EA_8BYTE; } - assert(emitter::isMaskReg(maskReg)); + op1Reg = op1->GetRegNum(); + assert(emitter::isMaskReg(op1Reg)); - emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); - emit->emitIns_Mov(kmovIns, kmovAttr, targetReg, maskReg, INS_FLAGS_DONT_CARE); + emit->emitIns_Mov(ins, attr, targetReg, op1Reg, INS_FLAGS_DONT_CARE); break; } @@ -2007,6 +2094,82 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_OrMask: + { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_korb; + } + else if (count == 16) + { + ins = INS_korw; + } + else if (count == 32) + { + ins = INS_kord; + } + else + { + assert(count == 64); + ins = INS_korq; + } + + op1Reg = op1->GetRegNum(); + + GenTree* op2 = node->Op(2); + regNumber op2Reg = op2->GetRegNum(); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + assert(emitter::isMaskReg(op2Reg)); + + // Use EA_32BYTE to ensure the VEX.L bit gets set + emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg); + break; + } + + case NI_AVX512F_XorMask: 
+ { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kxorb; + } + else if (count == 16) + { + ins = INS_kxorw; + } + else if (count == 32) + { + ins = INS_kxord; + } + else + { + assert(count == 64); + ins = INS_kxorq; + } + + op1Reg = op1->GetRegNum(); + + GenTree* op2 = node->Op(2); + regNumber op2Reg = op2->GetRegNum(); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + assert(emitter::isMaskReg(op2Reg)); + + // Use EA_32BYTE to ensure the VEX.L bit gets set + emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg); + break; + } + case NI_AVX512F_ConvertToUInt32: case NI_AVX512F_ConvertToUInt32WithTruncation: case NI_AVX512F_X64_ConvertToUInt64: diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 1d60e326384aa..02387ef068960 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -274,7 +274,7 @@ HARDWARE_INTRINSIC(Vector512, Divide, HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, EqualsAny, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(Vector512, Floor, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -828,27 +828,27 @@ HARDWARE_INTRINSIC(AVX2, Xor, // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, Abs, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pabsd, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, 
INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareNotLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareOrdered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F, CompareUnordered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareNotLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareOrdered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareUnordered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -906,7 +906,7 @@ HARDWARE_INTRINSIC(AVX512F, Max, HARDWARE_INTRINSIC(AVX512F, Min, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pminsd, INS_pminud, INS_vpminsq, INS_vpminuq, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, Multiply, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, true, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, true, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512F, Permute2x64, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, Permute4x32, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, Permute4x64, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) @@ -944,7 +944,7 @@ HARDWARE_INTRINSIC(AVX512F, Subtract, HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags @@ -956,11 +956,11 @@ HARDWARE_INTRINSIC(AVX512F_VL, AlignRight32, HARDWARE_INTRINSIC(AVX512F_VL, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F_VL, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F_VL, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Byte, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128ByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Double, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) @@ -1017,18 +1017,18 @@ HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512BW Intrinsics HARDWARE_INTRINSIC(AVX512BW, Abs, 64, 1, true, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, true, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, true, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512BW, AddSaturate, 64, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) 
HARDWARE_INTRINSIC(AVX512BW, AlignRight, 64, 3, false, {INS_palignr, INS_palignr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512BW, Average, 64, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX512BW, BlendVariable, 64, 3, true, {INS_vpblendmb, INS_vpblendmb, INS_vpblendmw, INS_vpblendmw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512BW, BlendVariable, 64, 3, true, {INS_vpblendmb, INS_vpblendmb, INS_vpblendmw, INS_vpblendmw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, 
INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256Byte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256ByteWithSaturation, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256SByte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1070,11 +1070,11 @@ HARDWARE_INTRINSIC(AVX512BW, UnpackLow, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512BW.VL Intrinsics -HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128Byte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128ByteWithSaturation, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128SByte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1109,8 +1109,8 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512DQ Intrinsics -HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, 
HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1125,12 +1125,12 @@ HARDWARE_INTRINSIC(AVX512DQ, ExtractVector256, HARDWARE_INTRINSIC(AVX512DQ, InsertVector128, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti64x2, INS_vinserti64x2, INS_invalid, INS_vinsertf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, InsertVector256, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti32x8, INS_vinserti32x8, INS_invalid, INS_invalid, INS_vinsertf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512DQ, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512DQ, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) HARDWARE_INTRINSIC(AVX512DQ, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, 
INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags @@ -1308,6 +1308,9 @@ HARDWARE_INTRINSIC(SSE41, PTEST, HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, AddMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, AndMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, AndNotMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, BlendVariableMask, -1, 3, true, {INS_vpblendmb, INS_vpblendmb, INS_vpblendmw, INS_vpblendmw, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, CompareEqualMask, -1, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanMask, -1, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) @@ -1323,7 +1326,9 @@ HARDWARE_INTRINSIC(AVX512F, CompareOrderedMask, HARDWARE_INTRINSIC(AVX512F, 
CompareUnorderedMask, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, ConvertMaskToVector, -1, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, ConvertVectorToMask, -1, 1, true, {INS_vpmovb2m, INS_vpmovb2m, INS_vpmovw2m, INS_vpmovw2m, INS_vpmovd2m, INS_vpmovd2m, INS_vpmovq2m, INS_vpmovq2m, INS_vpmovd2m, INS_vpmovq2m}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, MoveMask, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, OrMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index fc81d051c8e4c..2e8ba1c7cfae8 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1853,7 +1853,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpecial, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_Vector512_ExtractMostSignificantBits, + simdBaseJitType, simdSize); } break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c7328a7d4c002..92c9980c84a00 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1028,6 +1028,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + if (HWIntrinsicInfo::WithAvx512Mask(intrinsicId) && comp->IsBaselineVector512IsaSupportedOpportunistically()) + { + GenTree* nextNode = LowerHWIntrinsicWithAvx512Mask(node); + + if (nextNode != nullptr) + { + return nextNode; + } + } + switch (intrinsicId) { case NI_Vector128_ConditionalSelect: @@ -1610,40 +1620,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerFusedMultiplyAdd(node); break; - case NI_AVX512F_BlendVariable: - case NI_AVX512F_CompareEqual: - case NI_AVX512F_CompareGreaterThan: - case NI_AVX512F_CompareGreaterThanOrEqual: - case NI_AVX512F_CompareLessThan: - case NI_AVX512F_CompareLessThanOrEqual: - case NI_AVX512F_CompareNotEqual: - case NI_AVX512F_CompareNotGreaterThan: - case NI_AVX512F_CompareNotGreaterThanOrEqual: - case NI_AVX512F_CompareNotLessThan: - case NI_AVX512F_CompareNotLessThanOrEqual: - case 
NI_AVX512F_CompareOrdered: - case NI_AVX512F_CompareUnordered: - case NI_AVX512F_VL_CompareGreaterThan: - case NI_AVX512F_VL_CompareGreaterThanOrEqual: - case NI_AVX512F_VL_CompareLessThan: - case NI_AVX512F_VL_CompareLessThanOrEqual: - case NI_AVX512F_VL_CompareNotEqual: - case NI_AVX512BW_BlendVariable: - case NI_AVX512BW_CompareEqual: - case NI_AVX512BW_CompareGreaterThan: - case NI_AVX512BW_CompareGreaterThanOrEqual: - case NI_AVX512BW_CompareLessThan: - case NI_AVX512BW_CompareLessThanOrEqual: - case NI_AVX512BW_CompareNotEqual: - case NI_AVX512BW_VL_CompareGreaterThan: - case NI_AVX512BW_VL_CompareGreaterThanOrEqual: - case NI_AVX512BW_VL_CompareLessThan: - case NI_AVX512BW_VL_CompareLessThanOrEqual: - case NI_AVX512BW_VL_CompareNotEqual: - { - return LowerHWIntrinsicWithAvx512Mask(node); - } - default: break; } @@ -1694,29 +1670,80 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // just use PTEST. We can't support it for floating-point, however, // as it has both +0.0 and -0.0 where +0.0 == -0.0 - node->Op(1) = op1; - BlockRange().Remove(op2); + bool skipReplaceOperands = false; - LIR::Use op1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(op1Use); - op1 = node->Op(1); + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); + + switch (op1IntrinsicId) + { + case NI_SSE_And: + case NI_SSE2_And: + case NI_AVX_And: + case NI_AVX2_And: + { + // We can optimize to TestZ(op1.op1, op1.op2) + + node->Op(1) = op1Intrinsic->Op(1); + node->Op(2) = op1Intrinsic->Op(2); + + BlockRange().Remove(op1); + BlockRange().Remove(op2); + + skipReplaceOperands = true; + break; + } + + case NI_SSE_AndNot: + case NI_SSE2_AndNot: + case NI_AVX_AndNot: + case NI_AVX2_AndNot: + { + // We can optimize to TestC(op1.op1, op1.op2) + cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC; + + node->Op(1) = op1Intrinsic->Op(1); + node->Op(2) = op1Intrinsic->Op(2); + + BlockRange().Remove(op1); + BlockRange().Remove(op2); + + skipReplaceOperands = true; + break; + } + + default: + { + break; + } + } + } - op2 = comp->gtClone(op1); - BlockRange().InsertAfter(op1, op2); - node->Op(2) = op2; + if (!skipReplaceOperands) + { + // Default handler, emit a TestZ(op1, op1) + + node->Op(1) = op1; + BlockRange().Remove(op2); + + LIR::Use op1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(op1Use); + op1 = node->Op(1); + + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->Op(2) = op2; + } if (simdSize == 32) { - // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed? - node->ChangeHWIntrinsicId(NI_AVX_TestZ); LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd); } else { assert(simdSize == 16); - - // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed? 
- node->ChangeHWIntrinsicId(NI_SSE41_TestZ); LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd); } @@ -5070,21 +5097,57 @@ GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) var_types simdBaseType = node->GetSimdBaseType(); unsigned simdSize = node->GetSimdSize(); var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + size_t numArgs = node->GetOperandCount(); assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); NamedIntrinsic maskIntrinsicId = NI_Illegal; - GenTree** maskOperand = nullptr; + GenTree** maskOperand1 = nullptr; + GenTree** maskOperand2 = nullptr; switch (intrinsicId) { + case NI_Vector512_ExtractMostSignificantBits: + { + maskIntrinsicId = NI_AVX512F_MoveMask; + maskOperand1 = &node->Op(1); + break; + } + + case NI_AVX512F_Add: + case NI_AVX512BW_Add: + { + maskIntrinsicId = NI_AVX512F_AddMask; + maskOperand1 = &node->Op(1); + maskOperand2 = &node->Op(2); + break; + } + + case NI_AVX512F_And: + case NI_AVX512DQ_And: + { + maskIntrinsicId = NI_AVX512F_AndMask; + maskOperand1 = &node->Op(1); + maskOperand2 = &node->Op(2); + break; + } + + case NI_AVX512F_AndNot: + case NI_AVX512DQ_AndNot: + { + maskIntrinsicId = NI_AVX512F_AndNotMask; + maskOperand1 = &node->Op(1); + maskOperand2 = &node->Op(2); + break; + } + case NI_AVX512F_BlendVariable: case NI_AVX512BW_BlendVariable: { maskIntrinsicId = NI_AVX512F_BlendVariableMask; - maskOperand = &node->Op(3); + maskOperand1 = &node->Op(3); break; } @@ -5201,6 +5264,24 @@ GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_Or: + case NI_AVX512DQ_Or: + { + maskIntrinsicId = NI_AVX512F_OrMask; + maskOperand1 = &node->Op(1); + maskOperand2 = &node->Op(2); + break; + } + + case NI_AVX512F_Xor: + case NI_AVX512DQ_Xor: + { + maskIntrinsicId = NI_AVX512F_XorMask; + maskOperand1 = &node->Op(1); + maskOperand2 = &node->Op(2); + break; + } + default: { unreached(); @@ -5208,27 +5289,59 @@ GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) } assert(maskIntrinsicId != NI_Illegal); - node->ChangeHWIntrinsicId(maskIntrinsicId); + bool insertMaskToVector = false; - if (maskOperand != nullptr) + if (maskOperand1 != nullptr) { - GenTree* maskOp = *maskOperand; + GenTree* maskOp1 = *maskOperand1; + GenTree* maskOp2 = (maskOperand2 != nullptr) ? 
*maskOperand2 : nullptr; - if (maskOp->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector)) + GenTree* op1 = *maskOperand1; + + if (maskOp1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) && + (genTypeSize(maskOp1->AsHWIntrinsic()->GetSimdBaseType()) == genTypeSize(simdBaseType))) { - GenTreeHWIntrinsic* maskToVector = maskOp->AsHWIntrinsic(); - *maskOperand = maskToVector->Op(1); - BlockRange().Remove(maskOp); + if (maskOp2 != nullptr) + { + if (!maskOp2->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) || + (genTypeSize(maskOp2->AsHWIntrinsic()->GetSimdBaseType()) != genTypeSize(simdBaseType))) + { + // We don't want to handle this case, so return + return nullptr; + } + + GenTreeHWIntrinsic* maskToVector2 = maskOp2->AsHWIntrinsic(); + *maskOperand2 = maskToVector2->Op(1); + BlockRange().Remove(maskOp2); + + insertMaskToVector = true; + } + + GenTreeHWIntrinsic* maskToVector1 = maskOp1->AsHWIntrinsic(); + *maskOperand1 = maskToVector1->Op(1); + BlockRange().Remove(maskOp1); } - else + else if (maskOp2 == nullptr) { - GenTree* vectorToMask = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskOp, NI_AVX512F_ConvertVectorToMask, + assert((maskIntrinsicId == NI_AVX512F_MoveMask) || (maskIntrinsicId == NI_AVX512F_BlendVariableMask)); + + GenTree* vectorToMask = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskOp1, NI_AVX512F_ConvertVectorToMask, simdBaseJitType, simdSize); - BlockRange().InsertAfter(maskOp, vectorToMask); - *maskOperand = vectorToMask; + BlockRange().InsertAfter(maskOp1, vectorToMask); + *maskOperand1 = vectorToMask; + } + else + { + // We don't want to handle this case, so return + return nullptr; } } else + { + insertMaskToVector = true; + } + + if (insertMaskToVector) { node->gtType = TYP_MASK; @@ -5242,6 +5355,7 @@ GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) } } + node->ChangeHWIntrinsicId(maskIntrinsicId); return LowerNode(node); } diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index b33bd2ab32523..ee81f7d815d66 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -708,13 +708,15 @@ LinearScan::LinearScan(Compiler* theCompiler) availableRegCount = ACTUAL_REG_COUNT; needNonIntegerRegisters = false; -#if defined(TARGET_XARCH) - #if defined(TARGET_AMD64) rbmAllFloat = compiler->rbmAllFloat; rbmFltCalleeTrash = compiler->rbmFltCalleeTrash; #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) + rbmAllMask = compiler->rbmAllMask; + rbmMskCalleeTrash = compiler->rbmMskCalleeTrash; + if (!compiler->canUseEvexEncoding()) { availableRegCount -= CNT_HIGHFLOAT; diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 6d7a548722ac2..bf32b41bc1d2b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -72,14 +72,6 @@ inline bool useFloatReg(var_types type) return (regType(type) == FloatRegisterType); } -//------------------------------------------------------------------------ -// registerTypesEquivalent: Check to see if two RegisterTypes are equivalent -// -inline bool registerTypesEquivalent(RegisterType a, RegisterType b) -{ - return varTypeIsIntegralOrI(a) == varTypeIsIntegralOrI(b); -} - //------------------------------------------------------------------------ // RefInfo: Captures the necessary information for a definition that is "in-flight" // during `buildIntervals` (i.e. 
a tree-node definition has been encountered, @@ -2031,8 +2023,6 @@ class LinearScan : public LinearScanInterface #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); -#if defined(TARGET_XARCH) - #if defined(TARGET_AMD64) regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; @@ -2047,6 +2037,18 @@ class LinearScan : public LinearScanInterface } #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) + regMaskTP rbmAllMask; + regMaskTP rbmMskCalleeTrash; + + regMaskTP get_RBM_ALLMASK() const + { + return this->rbmAllMask; + } + regMaskTP get_RBM_MSK_CALLEE_TRASH() const + { + return this->rbmMskCalleeTrash; + } #endif // TARGET_XARCH unsigned availableRegCount; @@ -2064,7 +2066,21 @@ class LinearScan : public LinearScanInterface // static regMaskTP calleeSaveRegs(RegisterType rt) { - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; + if (varTypeIsIntegralOrI(rt)) + { + return RBM_INT_CALLEE_SAVED; + } +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + else if (varTypeIsMask(rt)) + { + return RBM_MSK_CALLEE_SAVED; + } +#endif // TARGET_XARCH && FEATURE_SIMD + else + { + assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt)); + return RBM_FLT_CALLEE_SAVED; + } } //------------------------------------------------------------------------ @@ -2072,7 +2088,21 @@ class LinearScan : public LinearScanInterface // regMaskTP callerSaveRegs(RegisterType rt) const { - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; + if (varTypeIsIntegralOrI(rt)) + { + return RBM_INT_CALLEE_TRASH; + } +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + else if (varTypeIsMask(rt)) + { + return RBM_MSK_CALLEE_TRASH; + } +#endif // TARGET_XARCH && FEATURE_SIMD + else + { + assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt)); + return RBM_FLT_CALLEE_TRASH; + } } }; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 0e209c589103a..b44b8339482dd 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1923,6 +1923,11 @@ static const regNumber lsraRegOrderFltEvex[] = {REG_VAR_ORDER_FLT_EVEX}; const unsigned lsraRegOrderFltEvexSize = ArrLen(lsraRegOrderFltEvex); #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) +static const regNumber lsraRegOrderMsk[] = {REG_VAR_ORDER_MSK}; +const unsigned lsraRegOrderMskSize = ArrLen(lsraRegOrderMsk); +#endif // TARGET_XARCH + //------------------------------------------------------------------------ // buildPhysRegRecords: Make an interval for each physical register // @@ -1978,6 +1983,20 @@ void LinearScan::buildPhysRegRecords() RegRecord* curr = &physRegs[reg]; curr->regOrder = (unsigned char)i; } + +#if defined(TARGET_XARCH) + // xarch has mask registers available when EVEX is supported + + if (compiler->canUseEvexEncoding()) + { + for (unsigned int i = 0; i < lsraRegOrderMskSize; i++) + { + regNumber reg = lsraRegOrderMsk[i]; + RegRecord* curr = &physRegs[reg]; + curr->regOrder = (unsigned char)i; + } + } +#endif // TARGET_XARCH } //------------------------------------------------------------------------ @@ -2715,6 +2734,12 @@ void LinearScan::buildIntervals() { calleeSaveCount = CNT_CALLEE_ENREG; } +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + else if (varTypeUsesMaskReg(interval->registerType)) + { + calleeSaveCount = CNT_CALLEE_SAVED_MASK; + } +#endif // TARGET_XARCH && FEATURE_SIMD else { assert(varTypeUsesFloatReg(interval->registerType)); @@ -3968,6 +3993,12 @@ int LinearScan::BuildReturn(GenTree* tree) { buildInternalIntRegisterDefForNode(tree, dstRegMask); } 
+#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + else if (varTypeUsesMaskReg(dstType)) + { + buildInternalMaskRegisterDefForNode(tree, dstRegMask); + } +#endif // TARGET_XARCH && FEATURE_SIMD else { assert(varTypeUsesFloatReg(dstType)); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 0e5c929e933b4..5d54c08ebb790 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2580,16 +2580,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } - case NI_AVX512F_MoveMaskSpecial: - { - srcCount += BuildOperandUses(op1); - buildInternalMaskRegisterDefForNode(intrinsicTree); - setInternalRegsDelayFree = true; - - buildUses = false; - break; - } - default: { assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index aada87eccfaf2..37e35481da264 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -92,7 +92,9 @@ #define REG_MASK_FIRST REG_K0 #define REG_MASK_LAST REG_K7 - #define RBM_ALLMASK RBM_K1 + #define RBM_ALLMASK_INIT (0) + #define RBM_ALLMASK_EVEX (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7) + #define RBM_ALLMASK get_RBM_ALLMASK() #define CNT_MASK_REGS 8 @@ -155,17 +157,20 @@ #define RBM_FLT_CALLEE_TRASH get_RBM_FLT_CALLEE_TRASH() + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_MSK_CALLEE_TRASH_INIT (0) + #define RBM_MSK_CALLEE_TRASH_EVEX (0) + #define RBM_MSK_CALLEE_SAVED (0) - #define RBM_MSK_CALLEE_TRASH RBM_ALLMASK + #define RBM_MSK_CALLEE_TRASH get_RBM_MSK_CALLEE_TRASH() #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 - // TODO-AVX512: Add RBM_MSK_CALLEE_* - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) - #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) + #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH | RBM_MSK_CALLEE_TRASH) + #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED | RBM_MSK_CALLEE_SAVED) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -285,6 +290,7 @@ #define REG_VAR_ORDER REG_VAR_ORDER_CALLEE_TRASH,REG_VAR_ORDER_CALLEE_SAVED #define REG_VAR_ORDER_FLT REG_VAR_ORDER_FLT_CALLEE_TRASH,REG_VAR_ORDER_FLT_CALLEE_SAVED #define REG_VAR_ORDER_FLT_EVEX REG_VAR_ORDER_FLT_EVEX_CALLEE_TRASH,REG_VAR_ORDER_FLT_EVEX_CALLEE_SAVED +#define REG_VAR_ORDER_MSK REG_K1,REG_K2,REG_K3,REG_K4,REG_K5,REG_K6,REG_K7 #ifdef UNIX_AMD64_ABI #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) @@ -312,6 +318,12 @@ #define CNT_CALLEE_TRASH_FLOAT get_CNT_CALLEE_TRASH_FLOAT() + #define CNT_CALLEE_SAVED_MASK (0) + + #define CNT_CALLEE_TRASH_MASK_INIT (0) + #define CNT_CALLEE_TRASH_MASK_EVEX (7) + #define CNT_CALLEE_TRASH_MASK get_CNT_CALLEE_TRASH_MASK() + #define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) #define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index e6916ed952001..60b2f7793f435 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -92,11 +92,9 @@ #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) #define RBM_ALLDOUBLE RBM_ALLFLOAT -#if !defined(UNIX_X86_ABI) - #define RBM_ALLMASK RBM_K1 -#else - #define RBM_ALLMASK (0) -#endif + #define 
RBM_ALLMASK_INIT (0) + #define RBM_ALLMASK_EVEX (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7) + #define RBM_ALLMASK get_RBM_ALLMASK() #define CNT_HIGHFLOAT 0 @@ -105,12 +103,17 @@ #define RBM_FLT_CALLEE_SAVED RBM_NONE #define RBM_FLT_CALLEE_TRASH RBM_ALLFLOAT #define REG_VAR_ORDER_FLT REG_XMM0, REG_XMM1, REG_XMM2, REG_XMM3, REG_XMM4, REG_XMM5, REG_XMM6, REG_XMM7 + #define REG_VAR_ORDER_MSK REG_K1,REG_K2,REG_K3,REG_K4,REG_K5,REG_K6,REG_K7 #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM7 + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_MSK_CALLEE_TRASH_INIT (0) + #define RBM_MSK_CALLEE_TRASH_EVEX RBM_ALLMASK_EVEX + #define RBM_MSK_CALLEE_SAVED (0) - #define RBM_MSK_CALLEE_TRASH RBM_ALLMASK + #define RBM_MSK_CALLEE_TRASH get_RBM_MSK_CALLEE_TRASH() #define XMM_REGSIZE_BYTES 16 // XMM register size in bytes #define YMM_REGSIZE_BYTES 32 // YMM register size in bytes @@ -133,9 +136,8 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX) - // TODO-AVX512: Add RBM_MSK_CALLEE_* - #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) - #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED | RBM_MSK_CALLEE_SAVED) + #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH | RBM_MSK_CALLEE_TRASH) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -153,6 +155,12 @@ #define CNT_CALLEE_SAVED_FLOAT (0) #define CNT_CALLEE_TRASH_FLOAT (6) + #define CNT_CALLEE_SAVED_MASK (0) + + #define CNT_CALLEE_TRASH_MASK_INIT (0) + #define CNT_CALLEE_TRASH_MASK_EVEX (7) + #define CNT_CALLEE_TRASH_MASK get_CNT_CALLEE_TRASH_MASK() + #define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) // EBX,ESI,EDI,EBP #define REG_LNGARG_LO REG_EAX From 2452f493f333695dcbc65d5b572aa7fdebc9f196 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 18 Jul 2023 08:59:27 -0700 Subject: [PATCH 02/16] Apply formatting patch --- src/coreclr/jit/lowerxarch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 92c9980c84a00..2e8c34dfda5ff 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -5305,10 +5305,10 @@ GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) { if (!maskOp2->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) || (genTypeSize(maskOp2->AsHWIntrinsic()->GetSimdBaseType()) != genTypeSize(simdBaseType))) - { - // We don't want to handle this case, so return - return nullptr; - } + { + // We don't want to handle this case, so return + return nullptr; + } GenTreeHWIntrinsic* maskToVector2 = maskOp2->AsHWIntrinsic(); *maskOperand2 = maskToVector2->Op(1); From 28dda4a1fe2968f8211817c49f2dbf2d06be19df Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 18 Jul 2023 09:17:42 -0700 Subject: [PATCH 03/16] Fix an assert to include TYP_STRUCT --- src/coreclr/jit/lsra.h | 14 ++++++++++++-- src/coreclr/jit/lsrabuild.cpp | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index bf32b41bc1d2b..0a2549342708b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -2066,6 +2066,11 @@ class LinearScan : public LinearScanInterface // static regMaskTP calleeSaveRegs(RegisterType rt) { + // These should potentially use varTypeUses*Reg instead + 
// + // However, currently TYP_STRUCT hits the float path and + // this may have unexpected consequences if we change it + if (varTypeIsIntegralOrI(rt)) { return RBM_INT_CALLEE_SAVED; @@ -2078,7 +2083,7 @@ class LinearScan : public LinearScanInterface #endif // TARGET_XARCH && FEATURE_SIMD else { - assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt)); + assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt) || varTypeIsStruct(rt)); return RBM_FLT_CALLEE_SAVED; } } @@ -2088,6 +2093,11 @@ class LinearScan : public LinearScanInterface // regMaskTP callerSaveRegs(RegisterType rt) const { + // These should potentially use varTypeUses*Reg instead + // + // However, currently TYP_STRUCT hits the float path and + // this may have unexpected consequences if we change it + if (varTypeIsIntegralOrI(rt)) { return RBM_INT_CALLEE_TRASH; @@ -2100,7 +2110,7 @@ class LinearScan : public LinearScanInterface #endif // TARGET_XARCH && FEATURE_SIMD else { - assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt)); + assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt) || varTypeIsStruct(rt)); return RBM_FLT_CALLEE_TRASH; } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index b44b8339482dd..62ad28f2c3028 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -849,7 +849,7 @@ regMaskTP LinearScan::getKillSetForModDiv(GenTreeOp* node) regMaskTP killMask = RBM_NONE; #ifdef TARGET_XARCH assert(node->OperIs(GT_MOD, GT_DIV, GT_UMOD, GT_UDIV)); - if (!varTypeIsFloating(node->TypeGet())) + if (varTypeUsesIntReg(node->TypeGet())) { // Both RAX and RDX are killed by the operation killMask = RBM_RAX | RBM_RDX; @@ -4319,7 +4319,7 @@ int LinearScan::BuildCmpOperands(GenTree* tree) bool needByteRegs = false; if (varTypeIsByte(tree)) { - if (!varTypeIsFloating(op1)) + if (varTypeUsesIntReg(op1)) { needByteRegs = true; } From 47827c4db11153943c805d387eef40c3f3f3ecaa Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 18 Jul 2023 21:45:18 -0700 Subject: [PATCH 04/16] Ensure kmask registers aren't in the default killset --- src/coreclr/jit/lsra.h | 22 ++++++---------------- src/coreclr/jit/lsrabuild.cpp | 6 +++++- src/coreclr/jit/targetamd64.h | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 0a2549342708b..bc2660e57ba99 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -2066,24 +2066,19 @@ class LinearScan : public LinearScanInterface // static regMaskTP calleeSaveRegs(RegisterType rt) { - // These should potentially use varTypeUses*Reg instead - // - // However, currently TYP_STRUCT hits the float path and - // this may have unexpected consequences if we change it - - if (varTypeIsIntegralOrI(rt)) + if (varTypeUsesIntReg(rt)) { return RBM_INT_CALLEE_SAVED; } #if defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else if (varTypeIsMask(rt)) + else if (varTypeUsesMaskReg(rt)) { return RBM_MSK_CALLEE_SAVED; } #endif // TARGET_XARCH && FEATURE_SIMD else { - assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt) || varTypeIsStruct(rt)); + assert(varTypeUsesFloatReg(rt)); return RBM_FLT_CALLEE_SAVED; } } @@ -2093,24 +2088,19 @@ class LinearScan : public LinearScanInterface // regMaskTP callerSaveRegs(RegisterType rt) const { - // These should potentially use varTypeUses*Reg instead - // - // However, currently TYP_STRUCT hits the float path and - // this may have unexpected consequences if we change it - - if (varTypeIsIntegralOrI(rt)) + if (varTypeUsesIntReg(rt)) { return RBM_INT_CALLEE_TRASH; } #if 
defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else if (varTypeIsMask(rt)) + else if (varTypeUsesMaskReg(rt)) { return RBM_MSK_CALLEE_TRASH; } #endif // TARGET_XARCH && FEATURE_SIMD else { - assert(varTypeIsFloating(rt) || varTypeIsSIMD(rt) || varTypeIsStruct(rt)); + assert(varTypeUsesFloatReg(rt)); return RBM_FLT_CALLEE_TRASH; } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 62ad28f2c3028..91a9b8f59a7a8 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -892,6 +892,10 @@ regMaskTP LinearScan::getKillSetForCall(GenTreeCall* call) if (!compiler->compFloatingPointUsed) { killMask &= ~RBM_FLT_CALLEE_TRASH; + +#if defined(TARGET_XARCH) + killMask &= ~RBM_MSK_CALLEE_TRASH; +#endif // TARGET_XARCH } #ifdef TARGET_ARM if (call->IsVirtualStub()) @@ -1210,7 +1214,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo // If there are no callee-saved registers, the call could kill all the registers. // This is a valid state, so in that case assert should not trigger. The RA will spill in order // to free a register later. - assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE)); + assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE) || varTypeIsStruct(varDsc->lvType)); } } } diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 37e35481da264..6d3f92fb6da90 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -159,7 +159,7 @@ /* NOTE: Sync with variable name defined in compiler.h */ #define RBM_MSK_CALLEE_TRASH_INIT (0) - #define RBM_MSK_CALLEE_TRASH_EVEX (0) + #define RBM_MSK_CALLEE_TRASH_EVEX RBM_ALLMASK_EVEX #define RBM_MSK_CALLEE_SAVED (0) #define RBM_MSK_CALLEE_TRASH get_RBM_MSK_CALLEE_TRASH() From 442e26d1abced6191b329e262744cae36db0d7c0 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 19 Jul 2023 07:29:12 -0700 Subject: [PATCH 05/16] Apply formatting patch --- src/coreclr/jit/lsrabuild.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 91a9b8f59a7a8..12aad1c8c77de 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1214,7 +1214,8 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo // If there are no callee-saved registers, the call could kill all the registers. // This is a valid state, so in that case assert should not trigger. The RA will spill in order // to free a register later. 
- assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE) || varTypeIsStruct(varDsc->lvType)); + assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE) || + varTypeIsStruct(varDsc->lvType)); } } } From d1ef0eaba275f7fa55e831e2c6ccde5cb8aa4b8f Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 19 Jul 2023 19:57:19 -0700 Subject: [PATCH 06/16] Move the kmask optimizations up to morph --- src/coreclr/jit/emitxarch.cpp | 14 + src/coreclr/jit/gentree.cpp | 132 ++----- src/coreclr/jit/hwintrinsic.h | 9 - src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 60 +++- src/coreclr/jit/hwintrinsiclistxarch.h | 94 ++--- src/coreclr/jit/hwintrinsicxarch.cpp | 216 +++++++++++- src/coreclr/jit/instrsxarch.h | 8 + src/coreclr/jit/lower.h | 1 - src/coreclr/jit/lowerxarch.cpp | 370 +------------------- src/coreclr/jit/morph.cpp | 160 ++++++++- 10 files changed, 538 insertions(+), 526 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index d517d2fc63f31..e46ea5c2d9ec2 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -19172,6 +19172,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_6C : PERFSCORE_LATENCY_4C; break; + case INS_vptestmb: + case INS_vptestmd: + case INS_vptestmq: + case INS_vptestmw: + case INS_vptestnmb: + case INS_vptestnmd: + case INS_vptestnmq: + case INS_vptestnmw: + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_LATENCY_4C; + break; + } + case INS_mpsadbw: result.insThroughput = PERFSCORE_THROUGHPUT_2C; result.insLatency += PERFSCORE_LATENCY_4C; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 891ce9537478b..b8302609bbd34 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20605,7 +20605,8 @@ GenTree* Compiler::gtNewSimdCmpOpNode( var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); - NamedIntrinsic intrinsic = NI_Illegal; + NamedIntrinsic intrinsic = NI_Illegal; + bool needsConvertMaskToVector = false; switch (op) { @@ -20615,15 +20616,8 @@ GenTree* Compiler::gtNewSimdCmpOpNode( if (simdSize == 64) { assert(IsBaselineVector512IsaSupportedDebugOnly()); - - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_CompareEqual; - } - else - { - intrinsic = NI_AVX512F_CompareEqual; - } + intrinsic = NI_AVX512F_CompareEqualMask; + needsConvertMaskToVector = true; } else if (simdSize == 32) { @@ -20686,31 +20680,10 @@ GenTree* Compiler::gtNewSimdCmpOpNode( { if (IsBaselineVector512IsaSupportedOpportunistically()) { - if (simdSize == 64) - { - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_CompareGreaterThanOrEqual; - } - else - { - intrinsic = NI_AVX512F_CompareGreaterThanOrEqual; - } - break; - } - else if (!varTypeIsFloating(simdBaseType)) + if ((simdSize == 64) || !varTypeIsFloating(simdBaseType)) { - assert((simdSize == 16) || (simdSize == 32)); - - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_VL_CompareGreaterThanOrEqual; - } - else - { - intrinsic = NI_AVX512F_VL_CompareGreaterThanOrEqual; - } - + intrinsic = NI_AVX512F_CompareGreaterThanOrEqualMask; + needsConvertMaskToVector = true; break; } } @@ -20781,31 +20754,10 @@ GenTree* Compiler::gtNewSimdCmpOpNode( { if (IsBaselineVector512IsaSupportedOpportunistically()) { - if (simdSize == 64) - { - if (varTypeIsSmall(simdBaseType)) - { - 
intrinsic = NI_AVX512BW_CompareGreaterThan; - } - else - { - intrinsic = NI_AVX512F_CompareGreaterThan; - } - break; - } - else if (varTypeIsUnsigned(simdBaseType)) + if ((simdSize == 64) || varTypeIsUnsigned(simdBaseType)) { - assert((simdSize == 16) || (simdSize == 32)); - - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_VL_CompareGreaterThan; - } - else - { - intrinsic = NI_AVX512F_VL_CompareGreaterThan; - } - + intrinsic = NI_AVX512F_CompareGreaterThanMask; + needsConvertMaskToVector = true; break; } } @@ -20983,31 +20935,10 @@ GenTree* Compiler::gtNewSimdCmpOpNode( { if (IsBaselineVector512IsaSupportedOpportunistically()) { - if (simdSize == 64) - { - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_CompareLessThanOrEqual; - } - else - { - intrinsic = NI_AVX512F_CompareLessThanOrEqual; - } - break; - } - else if (!varTypeIsFloating(simdBaseType)) + if ((simdSize == 64) || !varTypeIsFloating(simdBaseType)) { - assert((simdSize == 16) || (simdSize == 32)); - - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_VL_CompareLessThanOrEqual; - } - else - { - intrinsic = NI_AVX512F_VL_CompareLessThanOrEqual; - } - + intrinsic = NI_AVX512F_CompareLessThanOrEqualMask; + needsConvertMaskToVector = true; break; } } @@ -21078,31 +21009,10 @@ GenTree* Compiler::gtNewSimdCmpOpNode( { if (IsBaselineVector512IsaSupportedOpportunistically()) { - if (simdSize == 64) + if ((simdSize == 64) || varTypeIsUnsigned(simdBaseType)) { - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_CompareLessThan; - } - else - { - intrinsic = NI_AVX512F_CompareLessThan; - } - break; - } - else if (varTypeIsUnsigned(simdBaseType)) - { - assert((simdSize == 16) || (simdSize == 32)); - - if (varTypeIsSmall(simdBaseType)) - { - intrinsic = NI_AVX512BW_VL_CompareLessThan; - } - else - { - intrinsic = NI_AVX512F_VL_CompareLessThan; - } - + intrinsic = NI_AVX512F_CompareLessThanMask; + needsConvertMaskToVector = true; break; } } @@ -21356,7 +21266,15 @@ GenTree* Compiler::gtNewSimdCmpOpNode( assert(intrinsic != NI_Illegal); #if defined(TARGET_XARCH) - return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); + if (needsConvertMaskToVector) + { + GenTree* retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize); + return gtNewSimdHWIntrinsicNode(type, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + } + else + { + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); + } #else return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); #endif diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index a610c6bf592e9..11e8c376a1a73 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -203,9 +203,6 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic is an embedded broadcast compatiable intrinsic HW_Flag_EmbBroadcastCompatible = 0x8000000, - - // The intrinsic can consume or produce an AVX512 mask register - HW_Flag_WithAvx512Mask = 0x10000000, #endif // TARGET_XARCH }; @@ -590,12 +587,6 @@ struct HWIntrinsicInfo HWIntrinsicFlag flags = lookupFlags(id); return (flags & HW_Flag_EmbBroadcastCompatible) != 0; } - - static bool WithAvx512Mask(NamedIntrinsic id) - { - HWIntrinsicFlag flags = lookupFlags(id); - return (flags & HW_Flag_WithAvx512Mask) != 0; - } #endif // TARGET_XARCH static bool IsMaybeCommutative(NamedIntrinsic id) diff --git 
a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 1792c3dd5c949..8f10f828c0957 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -2062,35 +2062,77 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX512F_KORTEST:
         {
-            op1Reg = op1->GetRegNum();
-
             uint32_t simdSize = node->GetSimdSize();
             uint32_t count    = simdSize / genTypeSize(baseType);
 
-            instruction testIns;
+            if (count <= 8)
+            {
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_kortestb;
+            }
+            else if (count == 16)
+            {
+                ins = INS_kortestw;
+            }
+            else if (count == 32)
+            {
+                ins = INS_kortestd;
+            }
+            else
+            {
+                assert(count == 64);
+                ins = INS_kortestq;
+            }
+
+            op1Reg           = op1->GetRegNum();
+            regNumber op2Reg = op1Reg;
+
+            if (node->GetOperandCount() == 2)
+            {
+                GenTree* op2 = node->Op(2);
+                op2Reg       = op2->GetRegNum();
+            }
+
+            assert(emitter::isMaskReg(op1Reg));
+            assert(emitter::isMaskReg(op2Reg));
+
+            emit->emitIns_R_R(ins, EA_8BYTE, op1Reg, op2Reg);
+            break;
+        }
+
+        case NI_AVX512F_KTEST:
+        {
+            uint32_t simdSize = node->GetSimdSize();
+            uint32_t count    = simdSize / genTypeSize(baseType);
 
             if (count <= 8)
             {
-                testIns = INS_kortestb;
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_ktestb;
             }
             else if (count == 16)
             {
-                testIns = INS_kortestw;
+                ins = INS_ktestw;
             }
             else if (count == 32)
             {
-                testIns = INS_kortestd;
+                ins = INS_ktestd;
             }
             else
             {
                 assert(count == 64);
-                testIns = INS_kortestq;
+                ins = INS_ktestq;
             }
 
-            assert(testIns != INS_invalid);
+            op1Reg = op1->GetRegNum();
+
+            GenTree*  op2    = node->Op(2);
+            regNumber op2Reg = op2->GetRegNum();
+
+            assert(emitter::isMaskReg(op1Reg));
+            assert(emitter::isMaskReg(op2Reg));
 
-            emit->emitIns_R_R(testIns, EA_8BYTE, op1Reg, op1Reg);
+            emit->emitIns_R_R(ins, EA_8BYTE, op1Reg, op2Reg);
             break;
         }
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 02387ef068960..20c3868012373 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -274,7 +274,7 @@ HARDWARE_INTRINSIC(Vector512, Divide,
 HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, EqualsAny, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask)
+HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper,
HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Floor, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -828,27 +828,27 @@ HARDWARE_INTRINSIC(AVX2, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, Abs, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pabsd, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512F, AndNot, 
64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendmd, INS_vpblendmd, INS_vpblendmq, INS_vpblendmq, INS_vblendmps, INS_vblendmpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_invalid, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareNotLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareOrdered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F, CompareUnordered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, CompareEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpeqd, INS_vpcmpeqd, INS_vpcmpeqq, INS_vpcmpeqq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpgtd, INS_vpcmpud, INS_vpcmpgtq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareNotGreaterThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareNotLessThan, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanOrEqual, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareOrdered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F, CompareUnordered, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -906,7 +906,7 @@ HARDWARE_INTRINSIC(AVX512F, Max, HARDWARE_INTRINSIC(AVX512F, Min, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pminsd, INS_pminud, INS_vpminsq, INS_vpminuq, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, Multiply, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, true, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, true, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, Permute2x64, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpd}, 
HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, Permute4x32, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, Permute4x64, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) @@ -944,7 +944,7 @@ HARDWARE_INTRINSIC(AVX512F, Subtract, HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags @@ -956,11 +956,11 @@ HARDWARE_INTRINSIC(AVX512F_VL, AlignRight32, HARDWARE_INTRINSIC(AVX512F_VL, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F_VL, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F_VL, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, 
INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThan, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareLessThanOrEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512F_VL, CompareNotEqual, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Byte, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128ByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_VL, ConvertToVector128Double, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) @@ -1017,18 +1017,18 @@ HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512BW Intrinsics HARDWARE_INTRINSIC(AVX512BW, Abs, 64, 1, true, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, true, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, Add, 64, 2, true, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW, AddSaturate, 64, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW, AlignRight, 64, 3, false, {INS_palignr, INS_palignr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512BW, Average, 64, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX512BW, BlendVariable, 64, 3, true, {INS_vpblendmb, INS_vpblendmb, INS_vpblendmw, INS_vpblendmw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, BlendVariable, 64, 3, true, {INS_vpblendmb, INS_vpblendmb, INS_vpblendmw, INS_vpblendmw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, true, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, 
{INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW, CompareEqual, 64, 2, true, {INS_vpcmpeqb, INS_vpcmpeqb, INS_vpcmpeqw, INS_vpcmpeqw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThan, 64, 2, true, {INS_vpcmpgtb, INS_vpcmpub, INS_vpcmpgtw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareGreaterThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThan, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareLessThanOrEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW, CompareNotEqual, 64, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256Byte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256ByteWithSaturation, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW, ConvertToVector256SByte, 64, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1070,11 +1070,11 @@ HARDWARE_INTRINSIC(AVX512BW, UnpackLow, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, 
TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512BW.VL Intrinsics -HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoCodeGen|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(AVX512BW_VL, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128Byte, 
-1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128ByteWithSaturation, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512BW_VL, ConvertToVector128SByte, -1, 1, false, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1109,8 +1109,8 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512DQ Intrinsics -HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_WithAvx512Mask) -HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) @@ -1125,12 +1125,12 @@ HARDWARE_INTRINSIC(AVX512DQ, ExtractVector256, HARDWARE_INTRINSIC(AVX512DQ, InsertVector128, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti64x2, INS_vinserti64x2, INS_invalid, INS_vinsertf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, 
InsertVector256, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti32x8, INS_vinserti32x8, INS_invalid, INS_invalid, INS_vinsertf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX512DQ, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512DQ, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512DQ, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_WithAvx512Mask) +HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags @@ -1306,7 +1306,10 @@ HARDWARE_INTRINSIC(SSE2, COMISD, HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, false, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, false, {INS_ptest, INS_ptest, 
INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, KTEST, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, PTESTM, 0, 2, false, {INS_vptestmb, INS_vptestmb, INS_vptestmw, INS_vptestmw, INS_vptestmd, INS_vptestmd, INS_vptestmq, INS_vptestmq, INS_vptestmd, INS_vptestmq}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, PTESTNM, 0, 2, false, {INS_vptestnmb, INS_vptestnmb, INS_vptestnmw, INS_vptestnmw, INS_vptestnmd, INS_vptestnmd, INS_vptestnmq, INS_vptestnmq, INS_vptestnmd, INS_vptestnmq}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, AddMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, AndMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) @@ -1324,9 +1327,12 @@ HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanMask, HARDWARE_INTRINSIC(AVX512F, CompareNotLessThanOrEqualMask, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, CompareOrderedMask, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, CompareUnorderedMask, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX512F, ConvertMaskToVector, -1, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(AVX512F, ConvertVectorToMask, -1, 1, true, {INS_vpmovb2m, INS_vpmovb2m, INS_vpmovw2m, INS_vpmovw2m, INS_vpmovd2m, INS_vpmovd2m, INS_vpmovq2m, INS_vpmovq2m, INS_vpmovd2m, INS_vpmovq2m}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, ConvertMaskToVector, -1, 1, true, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, ConvertVectorToMask, -1, 1, true, {INS_vpmovb2m, INS_vpmovb2m, 
INS_vpmovw2m, INS_vpmovw2m, INS_vpmovd2m, INS_vpmovd2m, INS_vpmovq2m, INS_vpmovq2m, INS_vpmovd2m, INS_vpmovq2m}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, MoveMask, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(AVX512F, NotMask, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512F, op_EqualityMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, op_InequalityMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, OrMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512F, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index c24f4c2db9702..488f65b5ac008 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1852,9 +1852,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, var_types simdType = getSIMDTypeForSize(simdSize); op1 = impSIMDPopStack(); + op1 = + gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_AVX512F_ConvertVectorToMask, simdBaseJitType, simdSize); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_Vector512_ExtractMostSignificantBits, - simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMask, simdBaseJitType, simdSize); } break; } @@ -3911,6 +3912,217 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_AVX512F_BlendVariable: + case NI_AVX512BW_BlendVariable: + { + assert(sig->numArgs == 3); + + op3 = impSIMDPopStack(); + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + op3 = gtNewSimdHWIntrinsicNode(TYP_MASK, op3, NI_AVX512F_ConvertVectorToMask, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, NI_AVX512F_BlendVariableMask, simdBaseJitType, + simdSize); + break; + } + + case NI_AVX512F_CompareEqual: + case NI_AVX512BW_CompareEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = + gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareEqualMask, simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareGreaterThan: + case NI_AVX512F_VL_CompareGreaterThan: + case NI_AVX512BW_CompareGreaterThan: + case 
NI_AVX512BW_VL_CompareGreaterThan: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareGreaterThanMask, simdBaseJitType, + simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareGreaterThanOrEqual: + case NI_AVX512F_VL_CompareGreaterThanOrEqual: + case NI_AVX512BW_CompareGreaterThanOrEqual: + case NI_AVX512BW_VL_CompareGreaterThanOrEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareGreaterThanOrEqualMask, + simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareLessThan: + case NI_AVX512F_VL_CompareLessThan: + case NI_AVX512BW_CompareLessThan: + case NI_AVX512BW_VL_CompareLessThan: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = + gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareLessThanMask, simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareLessThanOrEqual: + case NI_AVX512F_VL_CompareLessThanOrEqual: + case NI_AVX512BW_CompareLessThanOrEqual: + case NI_AVX512BW_VL_CompareLessThanOrEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareLessThanOrEqualMask, + simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareNotEqual: + case NI_AVX512F_VL_CompareNotEqual: + case NI_AVX512BW_CompareNotEqual: + case NI_AVX512BW_VL_CompareNotEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = + gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareNotEqualMask, simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareNotGreaterThan: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareNotGreaterThanMask, + simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareNotGreaterThanOrEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareNotGreaterThanOrEqualMask, + simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareNotLessThan: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareNotLessThanMask, simdBaseJitType, + simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, 
NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareNotLessThanOrEqual: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareNotLessThanOrEqualMask, + simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareOrdered: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = + gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareOrderedMask, simdBaseJitType, simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + + case NI_AVX512F_CompareUnordered: + { + assert(sig->numArgs == 2); + + op2 = impSIMDPopStack(); + op1 = impSIMDPopStack(); + + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareUnorderedMask, simdBaseJitType, + simdSize); + retNode = + gtNewSimdHWIntrinsicNode(retType, retNode, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + break; + } + case NI_AVX2_GatherMaskVector128: case NI_AVX2_GatherMaskVector256: { diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 1891e5fc6da09..e1c7b8edeee97 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -718,6 +718,10 @@ INST3(vpsraq, "psraq", IUM_WR, BAD_CODE, PCKD INST3(vpsravq, "psravq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Variable Bit Shift Right Arithmetic INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Bitwise Ternary Logic INST3(vpternlogq, "pternlogq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Bitwise Ternary Logic +INST3(vptestmd, "ptestmd", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x27), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND and set mask +INST3(vptestmq, "ptestmq", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x27), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND and set mask +INST3(vptestnmd, "ptestnmd", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x27), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask +INST3(vptestnmq, "ptestnmq", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x27), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Packed bit-wise XOR of two xmm regs INST3(vrangepd, "rangepd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x50), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Range restriction calculation from a pair of packed double-precision floating-point values INST3(vrangeps, "rangeps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x50), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Range restriction calculation from a pair of packed single-precision floating-point values @@ -805,6 +809,10 @@ INST3(vpmovuswb, "pmovuswb", IUM_WR, PSSE38(0xF3, 0x10), BAD_ INST3(vpsllvw, "psllvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x12), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical INST3(vpsravw, "psravw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x11), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic INST3(vpsrlvw, "psrlvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vptestmb, "ptestmb", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x26), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND and set mask +INST3(vptestmw, "ptestmw", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x26), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND and set mask +INST3(vptestnmb, "ptestnmb", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x26), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask +INST3(vptestnmw, "ptestnmw", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x26), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask // AVX512CD INST3(vpconflictd, "pconflictd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xC4), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // Detect conflicts within a vector of packed dword values into dense memory/register diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 2afbeb1b23df5..4f303007af293 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -359,7 +359,6 @@ class Lowering final : public Phase GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); - GenTree* LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index e3e39498e76b2..9cea6adaa542c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -862,8 +862,7 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn { GenTreeCC* cc = LowerNodeCC(node, condition); - // TODO-XARCH-AVX512 remove the KORTEST check when its promoted to 2 proper arguments - assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 2 || newIntrinsicId == NI_AVX512F_KORTEST); + assert((HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 2) || (newIntrinsicId == NI_AVX512F_KORTEST)); node->ChangeHWIntrinsicId(newIntrinsicId); node->gtType = TYP_VOID; node->ClearUnusedValue(); @@ -908,9 +907,10 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn } case NI_AVX512F_KORTEST: + case NI_AVX512F_KTEST: { - // TODO-XARCH-AVX512 remove the KORTEST check when its promoted to 2 proper arguments - assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 1); + // No containment support, so no reason to swap operands + canSwapOperands = false; break; } @@ 
-1033,16 +1033,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); - if (HWIntrinsicInfo::WithAvx512Mask(intrinsicId) && comp->IsBaselineVector512IsaSupportedOpportunistically()) - { - GenTree* nextNode = LowerHWIntrinsicWithAvx512Mask(node); - - if (nextNode != nullptr) - { - return nextNode; - } - } - switch (intrinsicId) { case NI_Vector128_ConditionalSelect: @@ -1675,80 +1665,29 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // just use PTEST. We can't support it for floating-point, however, // as it has both +0.0 and -0.0 where +0.0 == -0.0 - bool skipReplaceOperands = false; - - if (op1->OperIsHWIntrinsic()) - { - GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); - NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); - - switch (op1IntrinsicId) - { - case NI_SSE_And: - case NI_SSE2_And: - case NI_AVX_And: - case NI_AVX2_And: - { - // We can optimize to TestZ(op1.op1, op1.op2) - - node->Op(1) = op1Intrinsic->Op(1); - node->Op(2) = op1Intrinsic->Op(2); - - BlockRange().Remove(op1); - BlockRange().Remove(op2); - - skipReplaceOperands = true; - break; - } - - case NI_SSE_AndNot: - case NI_SSE2_AndNot: - case NI_AVX_AndNot: - case NI_AVX2_AndNot: - { - // We can optimize to TestC(op1.op1, op1.op2) - cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC; - - node->Op(1) = op1Intrinsic->Op(1); - node->Op(2) = op1Intrinsic->Op(2); - - BlockRange().Remove(op1); - BlockRange().Remove(op2); - - skipReplaceOperands = true; - break; - } - - default: - { - break; - } - } - } - - if (!skipReplaceOperands) - { - // Default handler, emit a TestZ(op1, op1) - - node->Op(1) = op1; - BlockRange().Remove(op2); + node->Op(1) = op1; + BlockRange().Remove(op2); - LIR::Use op1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(op1Use); - op1 = node->Op(1); + LIR::Use op1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(op1Use); + op1 = node->Op(1); - op2 = comp->gtClone(op1); - BlockRange().InsertAfter(op1, op2); - node->Op(2) = op2; - } + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->Op(2) = op2; if (simdSize == 32) { + // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed? + node->ChangeHWIntrinsicId(NI_AVX_TestZ); LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd); } else { assert(simdSize == 16); + + // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed? + node->ChangeHWIntrinsicId(NI_SSE41_TestZ); LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd); } @@ -5089,281 +5028,6 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) return tmp1->gtNext; } -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicWithAvx512Mask: Lowers a HWIntrinsic node that utilizes the AVX512 KMASK registers -// -// Arguments: -// node - The hardware intrinsic node. 
-// -GenTree* Lowering::LowerHWIntrinsicWithAvx512Mask(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); - CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); - var_types simdBaseType = node->GetSimdBaseType(); - unsigned simdSize = node->GetSimdSize(); - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - size_t numArgs = node->GetOperandCount(); - - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(simdBaseType)); - assert(simdSize != 0); - - NamedIntrinsic maskIntrinsicId = NI_Illegal; - GenTree** maskOperand1 = nullptr; - GenTree** maskOperand2 = nullptr; - - switch (intrinsicId) - { - case NI_Vector512_ExtractMostSignificantBits: - { - maskIntrinsicId = NI_AVX512F_MoveMask; - maskOperand1 = &node->Op(1); - break; - } - - case NI_AVX512F_Add: - case NI_AVX512BW_Add: - { - maskIntrinsicId = NI_AVX512F_AddMask; - maskOperand1 = &node->Op(1); - maskOperand2 = &node->Op(2); - break; - } - - case NI_AVX512F_And: - case NI_AVX512DQ_And: - { - maskIntrinsicId = NI_AVX512F_AndMask; - maskOperand1 = &node->Op(1); - maskOperand2 = &node->Op(2); - break; - } - - case NI_AVX512F_AndNot: - case NI_AVX512DQ_AndNot: - { - maskIntrinsicId = NI_AVX512F_AndNotMask; - maskOperand1 = &node->Op(1); - maskOperand2 = &node->Op(2); - break; - } - - case NI_AVX512F_BlendVariable: - case NI_AVX512BW_BlendVariable: - { - maskIntrinsicId = NI_AVX512F_BlendVariableMask; - maskOperand1 = &node->Op(3); - break; - } - - case NI_AVX512F_CompareEqual: - case NI_AVX512BW_CompareEqual: - { - maskIntrinsicId = NI_AVX512F_CompareEqualMask; - break; - } - - case NI_AVX512F_VL_CompareGreaterThan: - case NI_AVX512BW_VL_CompareGreaterThan: - { - assert(varTypeIsUnsigned(simdBaseType)); - FALLTHROUGH; - } - - case NI_AVX512F_CompareGreaterThan: - case NI_AVX512BW_CompareGreaterThan: - { - maskIntrinsicId = NI_AVX512F_CompareGreaterThanMask; - break; - } - - case NI_AVX512F_VL_CompareGreaterThanOrEqual: - case NI_AVX512BW_VL_CompareGreaterThanOrEqual: - { - assert(!varTypeIsFloating(simdBaseType)); - FALLTHROUGH; - } - - case NI_AVX512F_CompareGreaterThanOrEqual: - case NI_AVX512BW_CompareGreaterThanOrEqual: - { - maskIntrinsicId = NI_AVX512F_CompareGreaterThanOrEqualMask; - break; - } - - case NI_AVX512F_VL_CompareLessThan: - case NI_AVX512BW_VL_CompareLessThan: - { - assert(!varTypeIsFloating(simdBaseType)); - FALLTHROUGH; - } - - case NI_AVX512F_CompareLessThan: - case NI_AVX512BW_CompareLessThan: - { - maskIntrinsicId = NI_AVX512F_CompareLessThanMask; - break; - } - - case NI_AVX512F_VL_CompareLessThanOrEqual: - case NI_AVX512BW_VL_CompareLessThanOrEqual: - { - assert(!varTypeIsFloating(simdBaseType)); - FALLTHROUGH; - } - - case NI_AVX512F_CompareLessThanOrEqual: - case NI_AVX512BW_CompareLessThanOrEqual: - { - maskIntrinsicId = NI_AVX512F_CompareLessThanOrEqualMask; - break; - } - - case NI_AVX512F_VL_CompareNotEqual: - case NI_AVX512BW_VL_CompareNotEqual: - { - assert(!varTypeIsFloating(simdBaseType)); - FALLTHROUGH; - } - - case NI_AVX512F_CompareNotEqual: - case NI_AVX512BW_CompareNotEqual: - { - maskIntrinsicId = NI_AVX512F_CompareNotEqualMask; - break; - } - - case NI_AVX512F_CompareNotGreaterThan: - { - maskIntrinsicId = NI_AVX512F_CompareNotGreaterThanMask; - break; - } - - case NI_AVX512F_CompareNotGreaterThanOrEqual: - { - maskIntrinsicId = NI_AVX512F_CompareNotGreaterThanOrEqualMask; - break; - } - - case NI_AVX512F_CompareNotLessThan: - { - maskIntrinsicId = NI_AVX512F_CompareNotLessThanMask; - break; - } - - case 
NI_AVX512F_CompareNotLessThanOrEqual: - { - maskIntrinsicId = NI_AVX512F_CompareNotLessThanOrEqualMask; - break; - } - - case NI_AVX512F_CompareOrdered: - { - maskIntrinsicId = NI_AVX512F_CompareOrderedMask; - break; - } - - case NI_AVX512F_CompareUnordered: - { - maskIntrinsicId = NI_AVX512F_CompareUnorderedMask; - break; - } - - case NI_AVX512F_Or: - case NI_AVX512DQ_Or: - { - maskIntrinsicId = NI_AVX512F_OrMask; - maskOperand1 = &node->Op(1); - maskOperand2 = &node->Op(2); - break; - } - - case NI_AVX512F_Xor: - case NI_AVX512DQ_Xor: - { - maskIntrinsicId = NI_AVX512F_XorMask; - maskOperand1 = &node->Op(1); - maskOperand2 = &node->Op(2); - break; - } - - default: - { - unreached(); - } - } - - assert(maskIntrinsicId != NI_Illegal); - bool insertMaskToVector = false; - - if (maskOperand1 != nullptr) - { - GenTree* maskOp1 = *maskOperand1; - GenTree* maskOp2 = (maskOperand2 != nullptr) ? *maskOperand2 : nullptr; - - GenTree* op1 = *maskOperand1; - - if (maskOp1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) && - (genTypeSize(maskOp1->AsHWIntrinsic()->GetSimdBaseType()) == genTypeSize(simdBaseType))) - { - if (maskOp2 != nullptr) - { - if (!maskOp2->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) || - (genTypeSize(maskOp2->AsHWIntrinsic()->GetSimdBaseType()) != genTypeSize(simdBaseType))) - { - // We don't want to handle this case, so return - return nullptr; - } - - GenTreeHWIntrinsic* maskToVector2 = maskOp2->AsHWIntrinsic(); - *maskOperand2 = maskToVector2->Op(1); - BlockRange().Remove(maskOp2); - - insertMaskToVector = true; - } - - GenTreeHWIntrinsic* maskToVector1 = maskOp1->AsHWIntrinsic(); - *maskOperand1 = maskToVector1->Op(1); - BlockRange().Remove(maskOp1); - } - else if (maskOp2 == nullptr) - { - assert((maskIntrinsicId == NI_AVX512F_MoveMask) || (maskIntrinsicId == NI_AVX512F_BlendVariableMask)); - - GenTree* vectorToMask = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskOp1, NI_AVX512F_ConvertVectorToMask, - simdBaseJitType, simdSize); - BlockRange().InsertAfter(maskOp1, vectorToMask); - *maskOperand1 = vectorToMask; - } - else - { - // We don't want to handle this case, so return - return nullptr; - } - } - else - { - insertMaskToVector = true; - } - - if (insertMaskToVector) - { - node->gtType = TYP_MASK; - - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - GenTree* maskToVector = comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_ConvertMaskToVector, - simdBaseJitType, simdSize); - BlockRange().InsertAfter(node, maskToVector); - use.ReplaceWith(maskToVector); - } - } - - node->ChangeHWIntrinsicId(maskIntrinsicId); - return LowerNode(node); -} - //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call // diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 3babf06e54161..9266b428f2ac3 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -10693,7 +10693,8 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) return vecCon; } - NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + var_types simdBaseType = node->GetSimdBaseType(); switch (intrinsicId) { @@ -10806,6 +10807,163 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) return node; } +#if defined(TARGET_XARCH) + case NI_AVX512F_Add: + case NI_AVX512BW_Add: + case NI_AVX512F_And: + case NI_AVX512DQ_And: + case NI_AVX512F_AndNot: + case 
NI_AVX512DQ_AndNot: + case NI_AVX512F_Or: + case NI_AVX512DQ_Or: + case NI_AVX512F_Xor: + case NI_AVX512DQ_Xor: + { + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (!op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector) || + !op2->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector)) + { + // We need both operands to be ConvertMaskToVector in + // order to optimize this to a direct mask operation + break; + } + + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic(); + GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic(); + + if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) || + (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize)) + { + // We need both operands to be the same kind of mask; otherwise + // the bitwise operation can differ in how it performs + break; + } + + var_types simdType = node->TypeGet(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + unsigned simdSize = node->GetSimdSize(); + + NamedIntrinsic maskIntrinsicId = NI_Illegal; + + switch (intrinsicId) + { + case NI_AVX512F_Add: + case NI_AVX512BW_Add: + { + maskIntrinsicId = NI_AVX512F_AddMask; + break; + } + + case NI_AVX512F_And: + case NI_AVX512DQ_And: + { + maskIntrinsicId = NI_AVX512F_AndMask; + break; + } + + case NI_AVX512F_AndNot: + case NI_AVX512DQ_AndNot: + { + maskIntrinsicId = NI_AVX512F_AndNotMask; + break; + } + + case NI_AVX512F_Or: + case NI_AVX512DQ_Or: + { + maskIntrinsicId = NI_AVX512F_OrMask; + break; + } + + case NI_AVX512F_Xor: + case NI_AVX512DQ_Xor: + { + maskIntrinsicId = NI_AVX512F_XorMask; + break; + } + + default: + { + unreached(); + } + } + + assert(maskIntrinsicId != NI_Illegal); + + node->ChangeHWIntrinsicId(maskIntrinsicId); + node->gtType = TYP_MASK; + + node->Op(1) = cvtOp1->Op(1); + DEBUG_DESTROY_NODE(op1); + + node->Op(2) = cvtOp2->Op(1); + DEBUG_DESTROY_NODE(op2); + + node = gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_ConvertMaskToVector, simdBaseJitType, simdSize); + + INDEBUG(node->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED); + break; + } + + case NI_AVX512F_ConvertMaskToVector: + { + GenTree* op1 = node->Op(1); + + if (!op1->OperIsHWIntrinsic(NI_AVX512F_ConvertVectorToMask)) + { + break; + } + + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic(); + + if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize)) + { + // We need the operand to be the same kind of mask; otherwise + // the bitwise operation can differ in how it performs + break; + } + + GenTree* vectorNode = op1->AsHWIntrinsic()->Op(1); + + DEBUG_DESTROY_NODE(op1); + DEBUG_DESTROY_NODE(node); + + return vectorNode; + } + + case NI_AVX512F_ConvertVectorToMask: + { + GenTree* op1 = node->Op(1); + + if (!op1->OperIsHWIntrinsic(NI_AVX512F_ConvertMaskToVector)) + { + break; + } + + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic(); + + if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize)) + { + // We need the operand to be the same kind of mask; otherwise + // the bitwise operation can differ in how it performs + break; + } + + GenTree* maskNode = op1->AsHWIntrinsic()->Op(1); + + DEBUG_DESTROY_NODE(op1); + DEBUG_DESTROY_NODE(node); + + return maskNode; + } +#endif // TARGET_XARCH + default: { break; From 80a565f92ff151a838ac8d9e4d65e61590e4161d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 09:00:46 -0700 Subject: [PATCH 07/16] 
Ensure unique VN for ConvertMaskToVector --- src/coreclr/jit/valuenum.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 0cb86cbd04e75..4bb88f445d414 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -11835,7 +11835,36 @@ void Compiler::fgValueNumberHWIntrinsic(GenTreeHWIntrinsic* tree) } } - tree->gtVNPair = vnStore->VNPWithExc(normalPair, excSetPair); + // Some intrinsics should always be unique + bool makeUnique = false; + +#if defined(TARGET_XARCH) + switch (intrinsicId) + { + case NI_AVX512F_ConvertMaskToVector: + { + // We want to ensure that we get a TYP_MASK local to + // ensure the relevant optimizations can kick in + + makeUnique = true; + break; + } + + default: + { + break; + } + } +#endif // TARGET_XARCH + + if (makeUnique) + { + tree->gtVNPair = vnStore->VNPUniqueWithExc(tree->TypeGet(), excSetPair); + } + else + { + tree->gtVNPair = vnStore->VNPWithExc(normalPair, excSetPair); + } // Currently, the only exceptions these intrinsics could throw are NREs. // From 4b73427d2a12c2a8159ebd0a53a270eefdc9767a Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 12:01:29 -0700 Subject: [PATCH 08/16] Ensure some basic other handling for kmask testing is handled --- src/coreclr/jit/emitxarch.cpp | 13 ++ src/coreclr/jit/lowerxarch.cpp | 210 ++++++++++++++++++++++++++++++--- 2 files changed, 207 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e46ea5c2d9ec2..fcbce344e7a7e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6246,12 +6246,25 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) case INS_kmovb_msk: case INS_kmovw_msk: case INS_kmovd_msk: + { + // Zero-extends the source + hasSideEffect = true; + break; + } + case INS_kmovq_msk: + { + // No side effect, register is 64-bits + hasSideEffect = false; + break; + } + case INS_kmovb_gpr: case INS_kmovw_gpr: case INS_kmovd_gpr: case INS_kmovq_gpr: { + // Zero-extends the source hasSideEffect = true; break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 9cea6adaa542c..4e51b9d14f7f2 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1191,6 +1191,51 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return LowerHWIntrinsicCmpOp(node, GT_NE); } + case NI_AVX512F_CompareEqualMask: + case NI_AVX512F_CompareNotEqualMask: + { + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + if (op2->IsVectorZero()) + { + NamedIntrinsic testIntrinsicId; + + if (intrinsicId == NI_AVX512F_CompareEqualMask) + { + // We have `CompareEqual(x, Zero)` where a given element + // equaling zero returns 1. We can therefore use `vptestnm(x, x)` + // since it does `~(x & x)` and setting 1 if the result is zero. + + testIntrinsicId = NI_AVX512F_PTESTNM; + } + else + { + // We have `CompareNotEqual(x, Zero)` where a given element + // equaling zero returns 0. We can therefore use `vptestm(x, x)` + // since it does `x & x` and setting 0 if the result is zero. 
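The PTESTM/PTESTNM rewrite above is easier to follow with a scalar model of the two instructions' per-lane semantics. The sketch below is illustrative only: the helper names are invented rather than JIT code, and it assumes at most 64 lanes so the result fits in a uint64_t.

    #include <cstddef>
    #include <cstdint>

    // Scalar model of vptestm/vptestnm (illustrative; not JIT helpers):
    //   vptestm:  k[i] = ((x[i] & y[i]) != 0)
    //   vptestnm: k[i] = ((x[i] & y[i]) == 0)
    template <typename T, size_t N>
    uint64_t ModelPtestm(const T (&x)[N], const T (&y)[N])
    {
        uint64_t mask = 0;
        for (size_t i = 0; i < N; i++)
        {
            if ((x[i] & y[i]) != 0)
            {
                mask |= uint64_t(1) << i; // lane i has a surviving bit
            }
        }
        return mask;
    }

    template <typename T, size_t N>
    uint64_t ModelPtestnm(const T (&x)[N], const T (&y)[N])
    {
        uint64_t mask = 0;
        for (size_t i = 0; i < N; i++)
        {
            if ((x[i] & y[i]) == 0)
            {
                mask |= uint64_t(1) << i; // lane i ANDs to zero
            }
        }
        return mask;
    }

Passing the same vector for both operands turns each model into a zero test: ModelPtestnm(x, x) sets bit i exactly when x[i] == 0, matching CompareEqual(x, Zero), while ModelPtestm(x, x) sets it when x[i] != 0, matching CompareNotEqual(x, Zero). That is why the lowering below can clone op1 into op2 and drop the zero constant entirely.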
+ + assert(intrinsicId == NI_AVX512F_CompareNotEqualMask); + testIntrinsicId = NI_AVX512F_PTESTM; + } + + node->Op(1) = op1; + BlockRange().Remove(op2); + + LIR::Use op1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(op1Use); + op1 = node->Op(1); + + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->Op(2) = op2; + + node->ChangeHWIntrinsicId(testIntrinsicId); + return LowerNode(node); + } + break; + } + case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: @@ -1665,16 +1710,72 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // just use PTEST. We can't support it for floating-point, however, // as it has both +0.0 and -0.0 where +0.0 == -0.0 - node->Op(1) = op1; - BlockRange().Remove(op2); + bool skipReplaceOperands = false; - LIR::Use op1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(op1Use); - op1 = node->Op(1); + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); + + switch (op1IntrinsicId) + { + case NI_SSE_And: + case NI_SSE2_And: + case NI_AVX_And: + case NI_AVX2_And: + { + // We can optimize to TestZ(op1.op1, op1.op2) + + node->Op(1) = op1Intrinsic->Op(1); + node->Op(2) = op1Intrinsic->Op(2); + + BlockRange().Remove(op1); + BlockRange().Remove(op2); + + skipReplaceOperands = true; + break; + } + + case NI_SSE_AndNot: + case NI_SSE2_AndNot: + case NI_AVX_AndNot: + case NI_AVX2_AndNot: + { + // We can optimize to TestC(op1.op1, op1.op2) + cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC; + + node->Op(1) = op1Intrinsic->Op(1); + node->Op(2) = op1Intrinsic->Op(2); + + BlockRange().Remove(op1); + BlockRange().Remove(op2); + + skipReplaceOperands = true; + break; + } + + default: + { + break; + } + } + } + + if (!skipReplaceOperands) + { + // Default handler, emit a TestZ(op1, op1) - op2 = comp->gtClone(op1); - BlockRange().InsertAfter(op1, op2); - node->Op(2) = op2; + node->Op(1) = op1; + BlockRange().Remove(op2); + + LIR::Use op1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(op1Use); + op1 = node->Op(1); + + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->Op(2) = op2; + } if (simdSize == 32) { @@ -1722,6 +1823,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // For other comparisons, using EVEX allows us to avoid leaving the SIMD domain, avoids // needing to use a general-purpose register, and allows us to generate less instructions. 
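Before the EVEX paths below, it is worth spelling out the flag definitions the restored TestZ/TestC fast path relies on: PTEST(x, y) sets ZF when `x & y` is all zeros and CF when `~x & y` is all zeros. A minimal scalar model (invented names, not JIT code) shows why both rewrites are sound:

    #include <cstddef>
    #include <cstdint>

    // Scalar model of PTEST's flag outputs over 64-bit chunks (illustrative):
    //   ZF = ((x & y) == 0)   -> consumed by TestZ
    //   CF = ((~x & y) == 0)  -> consumed by TestC
    struct PtestFlags
    {
        bool zf;
        bool cf;
    };

    template <size_t N>
    PtestFlags ModelPtest(const uint64_t (&x)[N], const uint64_t (&y)[N])
    {
        PtestFlags flags = {true, true};
        for (size_t i = 0; i < N; i++)
        {
            if ((x[i] & y[i]) != 0)
            {
                flags.zf = false; // some bit survives the AND
            }
            if ((~x[i] & y[i]) != 0)
            {
                flags.cf = false; // y has a bit that x lacks
            }
        }
        return flags;
    }

In these terms, `And(x, y) == Zero` is exactly ModelPtest(x, y).zf, which is the TestZ transform, and `AndNot(x, y) == Zero` (where AndNot computes `~x & y`) is exactly ModelPtest(x, y).cf, which is why the TestC case flips cmpCnd to C/NC.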
+ GenTree* maskNode = node; GenTree* nextNode = node->gtNext; NamedIntrinsic maskIntrinsicId = NI_AVX512F_CompareEqualMask; @@ -1779,7 +1881,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // simply consume the mask directly and preserve the intended comparison by tweaking the // compare condition passed down into `KORTEST` - GenTreeHWIntrinsic* maskNode = op1->AsHWIntrinsic()->Op(1)->AsHWIntrinsic(); + maskNode = op1->AsHWIntrinsic()->Op(1); assert(maskNode->TypeIs(TYP_MASK)); bool isHandled = false; @@ -1817,7 +1919,14 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { assert((count == 1) || (count == 2) || (count == 4)); - switch (maskNode->GetHWIntrinsicId()) + maskIntrinsicId = NI_Illegal; + + if (maskNode->OperIsHWIntrinsic()) + { + maskIntrinsicId = maskNode->AsHWIntrinsic()->GetHWIntrinsicId(); + } + + switch (maskIntrinsicId) { case NI_AVX512F_CompareEqualMask: { @@ -1893,11 +2002,14 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm default: { - unreached(); + maskIntrinsicId = NI_AVX512F_NotMask; + maskNode = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, maskIntrinsicId, + simdBaseJitType, simdSize); + break; } } - maskNode->ChangeHWIntrinsicId(maskIntrinsicId); + maskNode->AsHWIntrinsic()->ChangeHWIntrinsicId(maskIntrinsicId); } else if (cmpOp == GT_EQ) { @@ -1926,12 +2038,77 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm BlockRange().Remove(op1); BlockRange().Remove(node); - node = maskNode; + op1 = nullptr; + op2 = nullptr; } } - if (node->gtType != TYP_MASK) + if (!varTypeIsFloating(simdBaseType) && (op2 != nullptr) && op2->IsVectorZero()) { + NamedIntrinsic testIntrinsicId = NI_AVX512F_PTESTM; + bool skipReplaceOperands = false; + + if (op1->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); + NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); + + switch (op1IntrinsicId) + { + case NI_AVX512F_And: + case NI_AVX512DQ_And: + { + // We can optimize since `vptestm` does `(x & y) != 0` + // and `vptestnm` does `(x & y) == 0`. + + if (cmpOp == GT_EQ) + { + testIntrinsicId = NI_AVX512F_PTESTNM; + } + + node->Op(1) = op1Intrinsic->Op(1); + node->Op(2) = op1Intrinsic->Op(2); + + BlockRange().Remove(op1); + BlockRange().Remove(op2); + + skipReplaceOperands = true; + break; + } + + default: + { + // We cannot optimize `AndNot` since `vptestnm` does ~(x & y) + break; + } + } + } + + if (!skipReplaceOperands) + { + node->Op(1) = op1; + BlockRange().Remove(op2); + + LIR::Use op1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(op1Use); + op1 = node->Op(1); + + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->Op(2) = op2; + } + + node->gtType = TYP_MASK; + node->ChangeHWIntrinsicId(testIntrinsicId); + + LowerNode(node); + maskNode = node; + } + + if (maskNode->gtType != TYP_MASK) + { + assert(node == maskNode); + // We have `x == y` or `x != y` both of which where we want to find `AllBitsSet` in the mask since // we can directly do the relevant comparison. Given the above tables then when we have a full mask // we can simply check against `CF == 1` for `op_Equality` and `ZF == 0` for `op_Inequality`. 
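            // For reference: `kortest k, k` sets ZF when the combined mask is all zeros and
            // sets CF when it is all ones for the tested width, which is what the tables
            // above rely on.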
@@ -1963,14 +2140,15 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm node->ChangeHWIntrinsicId(maskIntrinsicId); LowerNode(node); + maskNode = node; } LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) + if (BlockRange().TryGetUse(maskNode, &use)) { GenTreeHWIntrinsic* cc; - cc = comp->gtNewSimdHWIntrinsicNode(simdType, node, NI_AVX512F_KORTEST, simdBaseJitType, simdSize); + cc = comp->gtNewSimdHWIntrinsicNode(simdType, maskNode, NI_AVX512F_KORTEST, simdBaseJitType, simdSize); BlockRange().InsertBefore(nextNode, cc); use.ReplaceWith(cc); From 1e45fdd6593d8c0fa81829539347536fe7a23719 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 07:41:13 -0700 Subject: [PATCH 09/16] Improve the implementation for some managed Vector512 code paths --- .../src/System/SpanHelpers.Char.cs | 7 +-- .../src/System/SpanHelpers.Packed.cs | 51 ++++++++----------- .../src/System/SpanHelpers.T.cs | 14 ++--- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index adb369b2078e1..1f99365d3e441 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -646,10 +646,10 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace) Debug.Assert(lengthToExamine >= Vector512.Count); Vector512 search = *(Vector512*)(searchSpace + (nuint)offset); - ulong matches = Vector512.Equals(Vector512.Zero, search).AsByte().ExtractMostSignificantBits(); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, // So the bit position in 'matches' corresponds to the element offset. 
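                        // (With AVX-512, EqualsAny can lower to a single compare-into-mask plus
                        // a kortest, so the bit-flag extraction below is only paid on the rare
                        // match path.)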
- if (matches == 0) + if (!Vector512.EqualsAny(search, Vector512.Zero)) { // Zero flags set so no matches offset += Vector512.Count; @@ -659,7 +659,8 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace) // Find bitflag offset of first match and add to current offset, // flags are in bytes so divide for chars - return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + ulong matches = Vector512.Equals(search, Vector512.Zero).ExtractMostSignificantBits(); + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); } while (lengthToExamine > 0); } diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs index 2f70f00959c1c..e36e1d1e865e6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs @@ -132,9 +132,8 @@ public static bool Contains(ref short searchSpace, short value, int length) Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue, packedSource); - if (result != Vector512.Zero) + if (Vector512.EqualsAny(packedValue, packedSource)) { return true; } @@ -156,9 +155,8 @@ public static bool Contains(ref short searchSpace, short value, int length) Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue, packedSource); - if (result != Vector512.Zero) + if (Vector512.EqualsAny(packedValue, packedSource)) { return true; } @@ -332,12 +330,10 @@ private static int IndexOf(ref short searchSpace, short value, int len Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue, packedSource); - result = NegateIfNeeded(result); - if (result != Vector512.Zero) + if (NegateIfNeeded(Vector512.EqualsAny(packedValue, packedSource))) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, NegateIfNeeded(Vector512.Equals(packedValue, packedSource))); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); @@ -357,12 +353,10 @@ private static int IndexOf(ref short searchSpace, short value, int len Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue, packedSource); - result = NegateIfNeeded(result); - if (result != Vector512.Zero) + if (NegateIfNeeded(Vector512.EqualsAny(packedValue, packedSource))) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, NegateIfNeeded(Vector512.Equals(packedValue, packedSource))); } } } @@ -545,8 +539,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho Vector512 source0 = 
Vector512.LoadUnsafe(ref currentSearchSpace); Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource); - result = NegateIfNeeded(result); + Vector512 result = NegateIfNeeded(Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource)); if (result != Vector512.Zero) { @@ -570,8 +563,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource); - result = NegateIfNeeded(result); + Vector512 result = NegateIfNeeded(Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource)); if (result != Vector512.Zero) { @@ -763,8 +755,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource); - result = NegateIfNeeded(result); + Vector512 result = NegateIfNeeded(Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource)); if (result != Vector512.Zero) { @@ -788,8 +779,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource); - result = NegateIfNeeded(result); + Vector512 result = NegateIfNeeded(Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource)); if (result != Vector512.Zero) { @@ -963,13 +953,11 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI { Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); - Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.LessThanOrEqual(packedSource - lowVector, rangeVector); - result = NegateIfNeeded(result); + Vector512 packedSource = PackSources(source0, source1) - lowVector; - if (result != Vector512.Zero) + if (NegateIfNeeded(Vector512.LessThanOrEqualAny(packedSource, rangeVector))) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, NegateIfNeeded(Vector512.LessThanOrEqual(packedSource, rangeVector))); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); @@ -988,13 +976,11 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); Vector512 
source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); - Vector512 packedSource = PackSources(source0, source1); - Vector512 result = Vector512.LessThanOrEqual(packedSource - lowVector, rangeVector); - result = NegateIfNeeded(result); + Vector512 packedSource = PackSources(source0, source1) - lowVector; - if (result != Vector512.Zero) + if (NegateIfNeeded(Vector512.LessThanOrEqualAny(packedSource, rangeVector))) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, NegateIfNeeded(Vector512.LessThanOrEqual(packedSource, rangeVector))); } } } @@ -1154,6 +1140,11 @@ private static Vector128 PackSources(Vector128 source0, Vector128(bool result) + where TNegator : struct, SpanHelpers.INegator => + typeof(TNegator) == typeof(SpanHelpers.DontNegate) ? result : !result; + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 NegateIfNeeded(Vector128 result) where TNegator : struct, SpanHelpers.INegator => diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs index 0806bbc95daa3..cd891e0b7493f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs @@ -1566,24 +1566,26 @@ internal static int NonPackedIndexOfValueType(ref TValue searc // Loop until either we've finished all elements or there's less than a vector's-worth remaining. do { - equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref currentSearchSpace))); - if (equals == Vector512.Zero) + equals = Vector512.LoadUnsafe(ref currentSearchSpace); + + if (TNegator.NegateIfNeeded(!Vector512.EqualsAny(values, equals))) { currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); continue; } - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, TNegator.NegateIfNeeded(Vector512.Equals(values, equals))); } while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); // If any elements remain, process the last vector in the search space. 
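                // (The trailing read below deliberately overlaps elements already examined;
                // re-testing up to Vector512<TValue>.Count - 1 duplicates is harmless for a
                // first-match search.)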
if ((uint)length % Vector512.Count != 0) { - equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref oneVectorAwayFromEnd))); - if (equals != Vector512.Zero) + equals = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + + if (TNegator.NegateIfNeeded(Vector512.EqualsAny(values, equals))) { - return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, TNegator.NegateIfNeeded(Vector512.Equals(values, equals))); } } } From 31b08936899f94c6709876153e1ae3a59153415d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 12:33:29 -0700 Subject: [PATCH 10/16] Apply formatting patch --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4e51b9d14f7f2..f93bc6a0d68de 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2004,7 +2004,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { maskIntrinsicId = NI_AVX512F_NotMask; maskNode = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, maskIntrinsicId, - simdBaseJitType, simdSize); + simdBaseJitType, simdSize); break; } } From dfe3e31d556360543de9376f2464e95a68f210f5 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 13:53:21 -0700 Subject: [PATCH 11/16] Ensure that the knot intrinsic is inserted into the IR --- src/coreclr/jit/lowerxarch.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f93bc6a0d68de..5b3a17eee1440 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2004,12 +2004,14 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { maskIntrinsicId = NI_AVX512F_NotMask; maskNode = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, maskIntrinsicId, - simdBaseJitType, simdSize); + simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, maskNode); break; } } maskNode->AsHWIntrinsic()->ChangeHWIntrinsicId(maskIntrinsicId); + LowerNode(maskNode); } else if (cmpOp == GT_EQ) { From 75d4d2506d20dd942804398f489528d9b89566f1 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 20 Jul 2023 18:30:16 -0700 Subject: [PATCH 12/16] Apply formatting patch --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 5b3a17eee1440..06f9cb8894c39 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2004,7 +2004,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { maskIntrinsicId = NI_AVX512F_NotMask; maskNode = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, maskIntrinsicId, - simdBaseJitType, simdSize); + simdBaseJitType, simdSize); BlockRange().InsertBefore(node, maskNode); break; } From 07bfc1f41300b933ab68f220b9fab0892406da2a Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 21 Jul 2023 10:39:29 -0700 Subject: [PATCH 13/16] Ensure the conversion of CompareEqualMask(x, zero) to Test(x, x) doesn't happen for floating-point --- src/coreclr/jit/lowerxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 06f9cb8894c39..173d0c0177f75 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp 
@@ -1197,7 +1197,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); - if (op2->IsVectorZero()) + if (!varTypeIsFloating(node->GetSimdBaseType()) && op2->IsVectorZero()) { NamedIntrinsic testIntrinsicId; From 136e898309cd7cd62c2193135e8f90d40908e781 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Sat, 22 Jul 2023 08:58:21 -0700 Subject: [PATCH 14/16] Have callee/callerSaveRegs() use an array based lookup --- src/coreclr/jit/compiler.cpp | 8 +++--- src/coreclr/jit/emit.cpp | 4 +-- src/coreclr/jit/lsra.cpp | 2 +- src/coreclr/jit/lsra.h | 46 +++++++++++-------------------- src/coreclr/jit/typelist.h | 51 ++++++++++++++++++----------------- src/coreclr/jit/utils.cpp | 6 ++--- src/coreclr/jit/vartypesdef.h | 2 +- 7 files changed, 54 insertions(+), 65 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index ad9c37ec79343..24150005e9954 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -106,25 +106,25 @@ inline bool _our_GetThreadCycles(unsigned __int64* cycleOut) #endif // which host OS const BYTE genTypeSizes[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sz, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sz, #include "typelist.h" #undef DEF_TP }; const BYTE genTypeAlignments[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) al, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) al, #include "typelist.h" #undef DEF_TP }; const BYTE genTypeStSzs[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) st, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) st, #include "typelist.h" #undef DEF_TP }; const BYTE genActualTypes[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) jitType, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) jitType, #include "typelist.h" #undef DEF_TP }; diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index b241d0ddaf425..5d8d4bbc08092 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -590,13 +590,13 @@ void emitterStats(FILE* fout) /*****************************************************************************/ const unsigned short emitTypeSizes[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) sze, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) sze, #include "typelist.h" #undef DEF_TP }; const unsigned short emitTypeActSz[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) asze, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) asze, #include "typelist.h" #undef DEF_TP }; diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index ee81f7d815d66..24b071bdc4dea 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -806,7 +806,7 @@ LinearScan::LinearScan(Compiler* theCompiler) // Initialize the availableRegs to use for each TYP_* CLANG_FORMAT_COMMENT_ANCHOR; -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) \ +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) \ availableRegs[static_cast(TYP_##tn)] = ®Fld; #include "typelist.h" #undef DEF_TP diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 
bc2660e57ba99..8ebf1c46782ab 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -2066,21 +2066,14 @@ class LinearScan : public LinearScanInterface // static regMaskTP calleeSaveRegs(RegisterType rt) { - if (varTypeUsesIntReg(rt)) - { - return RBM_INT_CALLEE_SAVED; - } -#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else if (varTypeUsesMaskReg(rt)) - { - return RBM_MSK_CALLEE_SAVED; - } -#endif // TARGET_XARCH && FEATURE_SIMD - else - { - assert(varTypeUsesFloatReg(rt)); - return RBM_FLT_CALLEE_SAVED; - } + static const regMaskTP varTypeCalleeSaveRegs[] = { +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) csr, +#include "typelist.h" +#undef DEF_TP + }; + + assert((unsigned)rt < ArrLen(varTypeCalleeSaveRegs)); + return varTypeCalleeSaveRegs[rt]; } //------------------------------------------------------------------------ @@ -2088,21 +2081,14 @@ class LinearScan : public LinearScanInterface // regMaskTP callerSaveRegs(RegisterType rt) const { - if (varTypeUsesIntReg(rt)) - { - return RBM_INT_CALLEE_TRASH; - } -#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else if (varTypeUsesMaskReg(rt)) - { - return RBM_MSK_CALLEE_TRASH; - } -#endif // TARGET_XARCH && FEATURE_SIMD - else - { - assert(varTypeUsesFloatReg(rt)); - return RBM_FLT_CALLEE_TRASH; - } + static const regMaskTP varTypeCalleeTrashRegs[] = { +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr, +#include "typelist.h" +#undef DEF_TP + }; + + assert((unsigned)rt < ArrLen(varTypeCalleeTrashRegs)); + return varTypeCalleeTrashRegs[rt]; } }; diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 81b06c1917d8d..dad396b6f6335 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -25,47 +25,50 @@ al - alignment regTyp - LSRA: type of register to use regFld - LSRA: field to use to track available registers + csr - LSRA: registers to use for callee save (caller trash) + ctr - LSRA: registers to use for callee trash (caller save) tf - flags -DEF_TP(tn ,nm , jitType, sz,sze,asze, st,al, regTyp, regFld, tf ) +DEF_TP(tn ,nm , jitType, sz,sze,asze, st,al,regTyp, regFld, csr ctr +tf ) */ // clang-format off -DEF_TP(UNDEF ,"" , TYP_UNDEF, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, VTF_ANY) -DEF_TP(VOID ,"void" , TYP_VOID, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, VTF_ANY) +DEF_TP(UNDEF ,"" , TYP_UNDEF, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_ANY) +DEF_TP(VOID ,"void" , TYP_VOID, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_ANY) -DEF_TP(BOOL ,"bool" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, VTF_INT|VTF_UNS) -DEF_TP(BYTE ,"byte" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, VTF_INT) -DEF_TP(UBYTE ,"ubyte" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, VTF_INT|VTF_UNS) +DEF_TP(BOOL ,"bool" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_UNS) +DEF_TP(BYTE ,"byte" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT) +DEF_TP(UBYTE ,"ubyte" , TYP_INT, 1, 1, 4, 1, 1, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_UNS) -DEF_TP(SHORT ,"short" , TYP_INT, 2, 2, 4, 1, 2, VTR_INT, availableIntRegs, VTF_INT) -DEF_TP(USHORT ,"ushort" , TYP_INT, 2, 2, 4, 1, 2, VTR_INT, availableIntRegs, VTF_INT|VTF_UNS) +DEF_TP(SHORT ,"short" , TYP_INT, 2, 2, 4, 1, 2, VTR_INT, 
availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT) +DEF_TP(USHORT ,"ushort" , TYP_INT, 2, 2, 4, 1, 2, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_UNS) -DEF_TP(INT ,"int" , TYP_INT, 4, 4, 4, 1, 4, VTR_INT, availableIntRegs, VTF_INT|VTF_I32) -DEF_TP(UINT ,"uint" , TYP_INT, 4, 4, 4, 1, 4, VTR_INT, availableIntRegs, VTF_INT|VTF_UNS|VTF_I32) // Only used in GT_CAST nodes +DEF_TP(INT ,"int" , TYP_INT, 4, 4, 4, 1, 4, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_I32) +DEF_TP(UINT ,"uint" , TYP_INT, 4, 4, 4, 1, 4, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_UNS|VTF_I32) // Only used in GT_CAST nodes -DEF_TP(LONG ,"long" , TYP_LONG, 8,EPS,EPS, 2, 8, VTR_INT, availableIntRegs, VTF_INT|VTF_I64) -DEF_TP(ULONG ,"ulong" , TYP_LONG, 8,EPS,EPS, 2, 8, VTR_INT, availableIntRegs, VTF_INT|VTF_UNS|VTF_I64) // Only used in GT_CAST nodes +DEF_TP(LONG ,"long" , TYP_LONG, 8,EPS,EPS, 2, 8, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_I64) +DEF_TP(ULONG ,"ulong" , TYP_LONG, 8,EPS,EPS, 2, 8, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_INT|VTF_UNS|VTF_I64) // Only used in GT_CAST nodes -DEF_TP(FLOAT ,"float" , TYP_FLOAT, 4, 4, 4, 1, 4, VTR_FLOAT, availableFloatRegs, VTF_FLT) -DEF_TP(DOUBLE ,"double" , TYP_DOUBLE, 8, 8, 8, 2, 8, VTR_FLOAT, availableDoubleRegs, VTF_FLT) +DEF_TP(FLOAT ,"float" , TYP_FLOAT, 4, 4, 4, 1, 4, VTR_FLOAT, availableFloatRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_FLT) +DEF_TP(DOUBLE ,"double" , TYP_DOUBLE, 8, 8, 8, 2, 8, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_FLT) -DEF_TP(REF ,"ref" , TYP_REF, PS,GCS,GCS, PST,PS, VTR_INT, availableIntRegs, VTF_ANY|VTF_GCR|VTF_I) -DEF_TP(BYREF ,"byref" , TYP_BYREF, PS,BRS,BRS, PST,PS, VTR_INT, availableIntRegs, VTF_ANY|VTF_BYR|VTF_I) -DEF_TP(STRUCT ,"struct" , TYP_STRUCT, 0, 0, 0, 1, 4, VTR_INT, availableIntRegs, VTF_S) +DEF_TP(REF ,"ref" , TYP_REF, PS,GCS,GCS, PST,PS,VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_ANY|VTF_GCR|VTF_I) +DEF_TP(BYREF ,"byref" , TYP_BYREF, PS,BRS,BRS, PST,PS,VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_ANY|VTF_BYR|VTF_I) +DEF_TP(STRUCT ,"struct" , TYP_STRUCT, 0, 0, 0, 1, 4, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_S) #ifdef FEATURE_SIMD -DEF_TP(SIMD8 ,"simd8" , TYP_SIMD8, 8, 8, 8, 2, 8, VTR_FLOAT, availableDoubleRegs, VTF_S|VTF_VEC) -DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, 12,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, VTF_S|VTF_VEC) -DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, 16,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, VTF_S|VTF_VEC) +DEF_TP(SIMD8 ,"simd8" , TYP_SIMD8, 8, 8, 8, 2, 8, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) +DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, 12,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) +DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, 16,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) #if defined(TARGET_XARCH) -DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, 32,32, 32, 8,16, VTR_FLOAT, availableDoubleRegs, VTF_S|VTF_VEC) -DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, 64,64, 64, 16,16, VTR_FLOAT, availableDoubleRegs, VTF_S|VTF_VEC) -DEF_TP(MASK ,"mask" , TYP_MASK, 8, 8, 8, 2, 8, VTR_MASK, availableMaskRegs, VTF_ANY) +DEF_TP(SIMD32 
,"simd32" , TYP_SIMD32, 32,32, 32, 8,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) +DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, 64,64, 64, 16,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) +DEF_TP(MASK ,"mask" , TYP_MASK, 8, 8, 8, 2, 8, VTR_MASK, availableMaskRegs, RBM_MSK_CALLEE_SAVED, RBM_MSK_CALLEE_TRASH, VTF_S) #endif // TARGET_XARCH #endif // FEATURE_SIMD -DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, VTF_ANY) +DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, 0, 0, 0, 0, 0, VTR_INT, availableIntRegs, RBM_INT_CALLEE_SAVED, RBM_INT_CALLEE_TRASH, VTF_ANY) // clang-format on #undef GCS diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index 93a8292e07ab9..404bc7f6dedfd 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -79,13 +79,13 @@ const signed char opcodeSizes[] = // clang-format on const BYTE varTypeClassification[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) tf, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) tf, #include "typelist.h" #undef DEF_TP }; const BYTE varTypeRegister[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) regTyp, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) regTyp, #include "typelist.h" #undef DEF_TP }; @@ -111,7 +111,7 @@ extern const BYTE opcodeArgKinds[] = { const char* varTypeName(var_types vt) { static const char* const varTypeNames[] = { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) nm, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) nm, #include "typelist.h" #undef DEF_TP }; diff --git a/src/coreclr/jit/vartypesdef.h b/src/coreclr/jit/vartypesdef.h index dc27ac3adb14b..6f79e52e0334e 100644 --- a/src/coreclr/jit/vartypesdef.h +++ b/src/coreclr/jit/vartypesdef.h @@ -8,7 +8,7 @@ enum var_types : BYTE { -#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, tf) TYP_##tn, +#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) TYP_##tn, #include "typelist.h" #undef DEF_TP TYP_COUNT From 8a0c9a3145e3712d7eeb5c1c3d113e34e17a271c Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 24 Jul 2023 10:06:22 -0700 Subject: [PATCH 15/16] Respond to PR feedback and try to reduce TP regression more --- src/coreclr/jit/compiler.cpp | 12 +- src/coreclr/jit/compiler.h | 13 +- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 111 ++++++++++++++++++ src/coreclr/jit/hwintrinsiclistxarch.h | 2 + src/coreclr/jit/lowerxarch.cpp | 31 ++++- src/coreclr/jit/lsra.cpp | 4 +- src/coreclr/jit/lsra.h | 23 ++-- src/coreclr/jit/lsrabuild.cpp | 6 +- src/coreclr/jit/vartype.h | 14 ++- .../src/System/SpanHelpers.Char.cs | 8 +- 10 files changed, 200 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 24150005e9954..79fc72cd08bd6 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3386,9 +3386,19 @@ void Compiler::compInitOptions(JitFlags* jitFlags) { rbmAllMask |= RBM_ALLMASK_EVEX; rbmMskCalleeTrash |= RBM_MSK_CALLEE_TRASH_EVEX; - cntCalleeTrashMask += CNT_CALLEE_TRASH_MASK; + cntCalleeTrashMask += CNT_CALLEE_TRASH_MASK_EVEX; } + // Make sure we copy the register info and initialize the + // trash regs after the underlying fields are initialized + + const regMaskTP vtCalleeTrashRegs[TYP_COUNT]{ 
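+            // expands the ctr (callee-trash) column of typelist.h into a TYP_*-indexed table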
+#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr, +#include "typelist.h" +#undef DEF_TP + }; + memcpy(varTypeCalleeTrashRegs, vtCalleeTrashRegs, sizeof(regMaskTP) * TYP_COUNT); + codeGen->CopyRegisterInfo(); #endif // TARGET_XARCH } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 965d9809b3d4d..8ba162bd2de6f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10899,15 +10899,15 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX unsigned cntCalleeTrashFloat; public: - regMaskTP get_RBM_ALLFLOAT() const + FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const { return this->rbmAllFloat; } - regMaskTP get_RBM_FLT_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const { return this->rbmFltCalleeTrash; } - unsigned get_CNT_CALLEE_TRASH_FLOAT() const + FORCEINLINE unsigned get_CNT_CALLEE_TRASH_FLOAT() const { return this->cntCalleeTrashFloat; } @@ -10935,17 +10935,18 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regMaskTP rbmAllMask; regMaskTP rbmMskCalleeTrash; unsigned cntCalleeTrashMask; + regMaskTP varTypeCalleeTrashRegs[TYP_COUNT]; public: - regMaskTP get_RBM_ALLMASK() const + FORCEINLINE regMaskTP get_RBM_ALLMASK() const { return this->rbmAllMask; } - regMaskTP get_RBM_MSK_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const { return this->rbmMskCalleeTrash; } - unsigned get_CNT_CALLEE_TRASH_MASK() const + FORCEINLINE unsigned get_CNT_CALLEE_TRASH_MASK() const { return this->cntCalleeTrashMask; } diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 8f10f828c0957..accf1fc62552d 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -2136,6 +2136,39 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_NotMask: + { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_knotb; + } + else if (count == 16) + { + ins = INS_knotw; + } + else if (count == 32) + { + ins = INS_knotd; + } + else + { + assert(count == 64); + ins = INS_knotq; + } + + op1Reg = op1->GetRegNum(); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + + emit->emitIns_R_R(ins, EA_8BYTE, targetReg, op1Reg); + break; + } + case NI_AVX512F_OrMask: { uint32_t simdSize = node->GetSimdSize(); @@ -2174,6 +2207,84 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_ShiftLeftMask: + { + uint32_t simdSize = node->GetSimdSize(); + uint32_t count = simdSize / genTypeSize(baseType); + + if (count <= 8) + { + assert((count == 2) || (count == 4) || (count == 8)); + ins = INS_kshiftlb; + } + else if (count == 16) + { + ins = INS_kshiftlw; + } + else if (count == 32) + { + ins = INS_kshiftld; + } + else + { + assert(count == 64); + ins = INS_kshiftlq; + } + + op1Reg = op1->GetRegNum(); + + GenTree* op2 = node->Op(2); + assert(op2->IsCnsIntOrI() && op2->isContained()); + + assert(emitter::isMaskReg(targetReg)); + assert(emitter::isMaskReg(op1Reg)); + + ssize_t ival = op2->AsIntCon()->IconValue(); + assert((ival >= 0) && (ival <= 255)); + + emit->emitIns_R_R_I(ins, EA_8BYTE, targetReg, op1Reg, (int8_t)ival); + break; + } + + case NI_AVX512F_ShiftRightMask: + { + uint32_t simdSize = 
node->GetSimdSize();
+            uint32_t count    = simdSize / genTypeSize(baseType);
+
+            if (count <= 8)
+            {
+                assert((count == 2) || (count == 4) || (count == 8));
+                ins = INS_kshiftrb;
+            }
+            else if (count == 16)
+            {
+                ins = INS_kshiftrw;
+            }
+            else if (count == 32)
+            {
+                ins = INS_kshiftrd;
+            }
+            else
+            {
+                assert(count == 64);
+                ins = INS_kshiftrq;
+            }
+
+            op1Reg = op1->GetRegNum();
+
+            GenTree* op2 = node->Op(2);
+            assert(op2->IsCnsIntOrI() && op2->isContained());
+
+            assert(emitter::isMaskReg(targetReg));
+            assert(emitter::isMaskReg(op1Reg));
+
+            ssize_t ival = op2->AsIntCon()->IconValue();
+            assert((ival >= 0) && (ival <= 255));
+
+            emit->emitIns_R_R_I(ins, EA_8BYTE, targetReg, op1Reg, (int8_t)ival);
+            break;
+        }
+
         case NI_AVX512F_XorMask:
         {
             uint32_t simdSize = node->GetSimdSize();

diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 20c3868012373..b17e7d7a3a8ce 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1334,6 +1334,8 @@ HARDWARE_INTRINSIC(AVX512F, NotMask,
 HARDWARE_INTRINSIC(AVX512F, op_EqualityMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F, op_InequalityMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F, OrMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX512F, ShiftLeftMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AVX512F, ShiftRightMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AVX512F, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 #endif // FEATURE_HW_INTRINSIC

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 173d0c0177f75..f40acc150de1a 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -2002,10 +2002,37 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
                         default:
                         {
-                            maskIntrinsicId = NI_AVX512F_NotMask;
-                            maskNode        = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, maskIntrinsicId,
+                            // We don't have a well-known intrinsic, so we need to invert the mask, keeping the upper
+                            // n-bits clear. If we have 1 element, then the upper 7-bits need to be cleared. If we have
+                            // 2, then the upper 6-bits, and if we have 4, then the upper 4-bits.
+                            //
+                            // There isn't necessarily a trivial way to do this outside of not, shift-left by n,
+                            // shift-right by n.
This preserves count bits, while clearing the upper n-bits + + GenTree* cnsNode; + + maskNode = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, NI_AVX512F_NotMask, simdBaseJitType, simdSize); BlockRange().InsertBefore(node, maskNode); + + cnsNode = comp->gtNewIconNode(8 - count); + BlockRange().InsertAfter(maskNode, cnsNode); + + maskNode = + comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, cnsNode, NI_AVX512F_ShiftLeftMask, + simdBaseJitType, simdSize); + BlockRange().InsertAfter(cnsNode, maskNode); + LowerNode(maskNode); + + cnsNode = comp->gtNewIconNode(8 - count); + BlockRange().InsertAfter(maskNode, cnsNode); + + maskNode = + comp->gtNewSimdHWIntrinsicNode(TYP_MASK, maskNode, cnsNode, NI_AVX512F_ShiftRightMask, + simdBaseJitType, simdSize); + BlockRange().InsertAfter(cnsNode, maskNode); + + maskIntrinsicId = NI_AVX512F_ShiftRightMask; break; } } diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 24b071bdc4dea..485ba019ed55f 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -716,11 +716,11 @@ LinearScan::LinearScan(Compiler* theCompiler) #if defined(TARGET_XARCH) rbmAllMask = compiler->rbmAllMask; rbmMskCalleeTrash = compiler->rbmMskCalleeTrash; + memcpy(varTypeCalleeTrashRegs, compiler->varTypeCalleeTrashRegs, sizeof(regMaskTP) * TYP_COUNT); if (!compiler->canUseEvexEncoding()) { - availableRegCount -= CNT_HIGHFLOAT; - availableRegCount -= CNT_MASK_REGS; + availableRegCount -= (CNT_HIGHFLOAT + CNT_MASK_REGS); } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 8ebf1c46782ab..20941e45f9d1b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -2027,11 +2027,11 @@ class LinearScan : public LinearScanInterface regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; - regMaskTP get_RBM_ALLFLOAT() const + FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const { return this->rbmAllFloat; } - regMaskTP get_RBM_FLT_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const { return this->rbmFltCalleeTrash; } @@ -2041,11 +2041,11 @@ class LinearScan : public LinearScanInterface regMaskTP rbmAllMask; regMaskTP rbmMskCalleeTrash; - regMaskTP get_RBM_ALLMASK() const + FORCEINLINE regMaskTP get_RBM_ALLMASK() const { return this->rbmAllMask; } - regMaskTP get_RBM_MSK_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const { return this->rbmMskCalleeTrash; } @@ -2053,7 +2053,7 @@ class LinearScan : public LinearScanInterface unsigned availableRegCount; - unsigned get_AVAILABLE_REG_COUNT() const + FORCEINLINE unsigned get_AVAILABLE_REG_COUNT() const { return this->availableRegCount; } @@ -2064,7 +2064,7 @@ class LinearScan : public LinearScanInterface // NOTE: we currently don't need a LinearScan `this` pointer for this definition, and some callers // don't have one available, so make is static. 
    //
-    static regMaskTP calleeSaveRegs(RegisterType rt)
+    static FORCEINLINE regMaskTP calleeSaveRegs(RegisterType rt)
     {
         static const regMaskTP varTypeCalleeSaveRegs[] = {
 #define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) csr,
@@ -2076,16 +2076,25 @@ class LinearScan : public LinearScanInterface
         return varTypeCalleeSaveRegs[rt];
     }

+#if defined(TARGET_XARCH)
+    // Not all of the callee trash values are constant, so don't declare this as a method-local static;
+    // doing so results in significantly more complex codegen and we'd rather just initialize this once
+    // as part of initializing LSRA instead
+    regMaskTP varTypeCalleeTrashRegs[TYP_COUNT];
+#endif // TARGET_XARCH
+
     //------------------------------------------------------------------------
     // callerSaveRegs: Get the set of caller-save registers of the given RegisterType
     //
-    regMaskTP callerSaveRegs(RegisterType rt) const
+    FORCEINLINE regMaskTP callerSaveRegs(RegisterType rt) const
     {
+#if !defined(TARGET_XARCH)
         static const regMaskTP varTypeCalleeTrashRegs[] = {
 #define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr,
 #include "typelist.h"
 #undef DEF_TP
         };
+#endif // !TARGET_XARCH

         assert((unsigned)rt < ArrLen(varTypeCalleeTrashRegs));
         return varTypeCalleeTrashRegs[rt];
     }
 };

diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 12aad1c8c77de..8c9025f61b703 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -891,10 +891,10 @@ regMaskTP LinearScan::getKillSetForCall(GenTreeCall* call)
     // if there is no FP used, we can ignore the FP kills
     if (!compiler->compFloatingPointUsed)
     {
-        killMask &= ~RBM_FLT_CALLEE_TRASH;
-
 #if defined(TARGET_XARCH)
-        killMask &= ~RBM_MSK_CALLEE_TRASH;
+        killMask &= ~(RBM_FLT_CALLEE_TRASH | RBM_MSK_CALLEE_TRASH);
+#else
+        killMask &= ~RBM_FLT_CALLEE_TRASH;
 #endif // TARGET_XARCH
     }
 #ifdef TARGET_ARM

diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h
index 316bd2a867430..116d5ce2c0519 100644
--- a/src/coreclr/jit/vartype.h
+++ b/src/coreclr/jit/vartype.h
@@ -328,7 +328,19 @@ inline bool varTypeUsesFloatReg(T vt)
 template <class T>
 inline bool varTypeUsesMaskReg(T vt)
 {
-    return varTypeRegister[TypeGet(vt)] == VTR_MASK;
+// The technically correct check is:
+//     return varTypeRegister[TypeGet(vt)] == VTR_MASK;
+//
+// However, we only have one type that uses VTR_MASK today
+// and so it's quite a bit cheaper to just check that directly
+
+#if defined(FEATURE_SIMD) && defined(TARGET_XARCH)
+    assert((TypeGet(vt) == TYP_MASK) || (varTypeRegister[TypeGet(vt)] != VTR_MASK));
+    return TypeGet(vt) == TYP_MASK;
+#else
+    assert(varTypeRegister[TypeGet(vt)] != VTR_MASK);
+    return false;
+#endif
 }

 template <class T>

diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
index 1f99365d3e441..b6d92204f9d23 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
@@ -647,8 +647,9 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace)

                         Vector512<ushort> search = *(Vector512<ushort>*)(searchSpace + (nuint)offset);

-                        // Note that MoveMask has converted the equal vector elements into a set of bit flags,
-                        // So the bit position in 'matches' corresponds to the element offset.
+                        // AVX-512 returns comparison results in a mask register, so we want to optimize
+                        // the core check to simply be a "none match" check.
This will slightly increase + // the cost for the early match case, but greatly improves perf otherwise. if (!Vector512.EqualsAny(search, Vector512.Zero)) { // Zero flags set so no matches @@ -657,6 +658,9 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace) continue; } + // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + // // Find bitflag offset of first match and add to current offset, // flags are in bytes so divide for chars ulong matches = Vector512.Equals(search, Vector512.Zero).ExtractMostSignificantBits(); From 43ad9a0e048422fa0b38f5a7c56593627e1b7a3d Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 25 Jul 2023 09:37:08 -0700 Subject: [PATCH 16/16] Ensure PTEST doesn't try to handle something utilizing embedded broadcast --- src/coreclr/jit/codegeninterface.h | 8 +++--- src/coreclr/jit/emit.h | 4 +-- src/coreclr/jit/lowerxarch.cpp | 44 +++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index 4cfd462567d5b..bd931e598384a 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -63,11 +63,11 @@ class CodeGenInterface regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; - regMaskTP get_RBM_ALLFLOAT() const + FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const { return this->rbmAllFloat; } - regMaskTP get_RBM_FLT_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const { return this->rbmFltCalleeTrash; } @@ -80,11 +80,11 @@ class CodeGenInterface // Call this function after the equivalent fields in Compiler have been initialized. void CopyRegisterInfo(); - regMaskTP get_RBM_ALLMASK() const + FORCEINLINE regMaskTP get_RBM_ALLMASK() const { return this->rbmAllMask; } - regMaskTP get_RBM_MSK_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const { return this->rbmMskCalleeTrash; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 1ec95b21c03a0..fb3ba74e0e44b 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2305,7 +2305,7 @@ class emitter #if defined(TARGET_AMD64) regMaskTP rbmFltCalleeTrash; - regMaskTP get_RBM_FLT_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const { return this->rbmFltCalleeTrash; } @@ -2314,7 +2314,7 @@ class emitter #if defined(TARGET_XARCH) regMaskTP rbmMskCalleeTrash; - regMaskTP get_RBM_MSK_CALLEE_TRASH() const + FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const { return this->rbmMskCalleeTrash; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f40acc150de1a..08879e7143c84 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1717,6 +1717,19 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); + GenTree* nestedOp1 = nullptr; + GenTree* nestedOp2 = nullptr; + bool isEmbeddedBroadcast = false; + + if (op1Intrinsic->GetOperandCount() == 2) + { + nestedOp1 = op1Intrinsic->Op(1); + nestedOp2 = op1Intrinsic->Op(2); + + assert(!nestedOp1->isContained()); + isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic(); + } + switch (op1IntrinsicId) { case NI_SSE_And: @@ -1726,8 +1739,14 @@ GenTree* 
Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { // We can optimize to TestZ(op1.op1, op1.op2) - node->Op(1) = op1Intrinsic->Op(1); - node->Op(2) = op1Intrinsic->Op(2); + if (isEmbeddedBroadcast) + { + // PTEST doesn't support embedded broadcast + break; + } + + node->Op(1) = nestedOp1; + node->Op(2) = nestedOp2; BlockRange().Remove(op1); BlockRange().Remove(op2); @@ -1742,10 +1761,17 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm case NI_AVX2_AndNot: { // We can optimize to TestC(op1.op1, op1.op2) + + if (isEmbeddedBroadcast) + { + // PTEST doesn't support embedded broadcast + break; + } + cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC; - node->Op(1) = op1Intrinsic->Op(1); - node->Op(2) = op1Intrinsic->Op(2); + node->Op(1) = nestedOp1; + node->Op(2) = nestedOp2; BlockRange().Remove(op1); BlockRange().Remove(op2); @@ -8882,6 +8908,16 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) return; } + case NI_AVX512F_ShiftLeftMask: + case NI_AVX512F_ShiftRightMask: + { + // These intrinsics don't support a memory operand and + // we don't currently generate a jmp table fallback. + + assert(isContainedImm); + return; + } + default: { assert(!"Unhandled containment for binary hardware intrinsic with immediate operand");
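
As an illustrative footnote to the last change, a minimal C# sketch of the shape it
guards against (the method name is hypothetical, and whether the constant is actually
contained as an embedded broadcast depends on the target ISA and the JIT's containment
decisions): with AVX-512VL, the broadcasted constant can be folded into the `vpand`
itself, and since `ptest` has no broadcast form, the TestZ/TestC rewrite must be skipped.

    using System.Runtime.Intrinsics;

    static class PtestBroadcastExample
    {
        // Vector256.Create(0x7FFFFFFF) may be contained as an embedded broadcast of the
        // And node; lowering then keeps the And + compare-with-zero shape instead of
        // folding it into TestZ (vptest), which cannot encode a broadcast operand.
        static bool NoHighBitsSet(Vector256<int> v)
            => (v & Vector256.Create(0x7FFFFFFF)) == Vector256<int>.Zero;
    }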