Skip to content

Commit

Permalink
Optimize Vector128 and Vector256.Create methods (#35857)
Browse files Browse the repository at this point in the history
* Updating Vector128.Create(T value) to be intrinsic

* Updating Vector128.Create(T, ..., T) to be intrinsic

* Updating Vector256.Create(T) and Vector256.Create(T, ..., T) to be intrinsic

* Applying formatting patch

* Adding additional comments explaining how the Vector128.Create and Vector256.Create calls are lowered

* Add an assert that argCnt is as expected for LowerHWIntrinsicCreate

* Applying formatting patch

* Use unsigned rather than signed types in LowerHWIntrinsicCreate

* Have HWIntrinsicInfo::lookupId take the number of arguments into account

* Adjusting Vector128.Create(T, ..., T) to not be recursive on ARM64

* Fixing MyICJI::allocMem in superpmi to respect certain CorJitAllocMemFlag

* Avoid an implicit downcast
  • Loading branch information
tannergooding authored May 8, 2020
1 parent eb7198f commit 4d16376
Show file tree
Hide file tree
Showing 12 changed files with 1,491 additions and 495 deletions.
49 changes: 47 additions & 2 deletions src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param)
return RunWithErrorTrap(function, param);
}

// Ideally we'd just use the copies of this in standardmacros.h
// however, superpmi is missing various other dependencies as well
static size_t ALIGN_UP_SPMI(size_t val, size_t alignment)
{
return (val + (alignment - 1)) & ~(alignment - 1);
}

static void* ALIGN_UP_SPMI(void* val, size_t alignment)
{
return (void*)ALIGN_UP_SPMI((size_t)val, alignment);
}

// get a block of memory for the code, readonly data, and read-write data
void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
ULONG coldCodeSize, /* IN */
Expand All @@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
)
{
jitInstance->mc->cr->AddCall("allocMem");
// TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0?

// TODO-Cleanup: Could hot block size be ever 0?
*hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);

if (coldCodeSize > 0)
*coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);
else
*coldCodeBlock = nullptr;
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize);

if (roDataSize > 0)
{
size_t roDataAlignment = sizeof(void*);
size_t roDataAlignedSize = static_cast<size_t>(roDataSize);

if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0)
{
roDataAlignment = 32;
}
else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0)
{
roDataAlignment = 16;
}
else if (roDataSize >= 8)
{
roDataAlignment = 8;
}

// We need to round the roDataSize up to the alignment size and then
// overallocate by at most alignment - sizeof(void*) to ensure that
// we can offset roDataBlock to be an aligned address and that the
// allocation contains at least the originally requested size after

roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment);
roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*));
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize);
*roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment);
}
else
*roDataBlock = nullptr;

jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock,
coldCodeBlock, roDataBlock);
}
Expand Down
7 changes: 4 additions & 3 deletions src/coreclr/src/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align)
{
// Data can have any size but since alignment is deduced from the size there's no
// way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment.
// 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem).
assert(size <= 16);
// As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte
// alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
assert((size <= 16) || ((size % 16) == 0));

if (size == 16)
if (size >= 16)
{
emitConsDsc.align16 = true;
}
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
}

// Check that the offset is properly aligned (i.e. the ddd in [ddd])
assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0));
assert((emitChkAlign == false) || (ins == INS_lea) ||
((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0));
}
else
{
Expand Down
24 changes: 18 additions & 6 deletions src/coreclr/src/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va
// lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet
//
// Arguments:
// comp -- The compiler
// sig -- The signature of the intrinsic
// className -- The name of the class associated with the HWIntrinsic to lookup
// methodName -- The name of the method associated with the HWIntrinsic to lookup
// enclosingClassName -- The name of the enclosing class of X64 classes
//
// Return Value:
// The NamedIntrinsic associated with methodName and isa
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName)
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName)
{
// TODO-Throughput: replace sequential search by binary search
CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName);
Expand All @@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,

for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++)
{
const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i];

if (isa != hwIntrinsicInfoArray[i].isa)
{
continue;
}

if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0)
int numArgs = static_cast<unsigned>(intrinsicInfo.numArgs);

if ((numArgs != -1) && (sig->numArgs != static_cast<unsigned>(intrinsicInfo.numArgs)))
{
continue;
}

if (strcmp(methodName, intrinsicInfo.name) == 0)
{
return hwIntrinsicInfoArray[i].id;
return intrinsicInfo.id;
}
}

Expand Down
9 changes: 5 additions & 4 deletions src/coreclr/src/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,11 @@ struct HWIntrinsicInfo

static const HWIntrinsicInfo& lookup(NamedIntrinsic id);

static NamedIntrinsic lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName);
static NamedIntrinsic lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName);
static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName);

static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* sig);
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2,
HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
// The instruction generated for float/double depends on which ISAs are supported
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
Expand Down Expand Up @@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256,
HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
Expand Down
65 changes: 65 additions & 0 deletions src/coreclr/src/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;

if (!featureSIMD)
{
Expand Down Expand Up @@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector128_Create:
case NI_Vector256_Create:
{
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
// TODO-XARCH-CQ: It may be beneficial to emit the movq
// instruction, which takes a 64-bit memory address and
// works on 32-bit x86 systems.
break;
}
#endif // TARGET_X86

// We shouldn't handle this as an intrinsic if the
// respective ISAs have been disabled by the user.

if (intrinsic == NI_Vector256_Create)
{
if (!compExactlyDependsOn(InstructionSet_AVX))
{
break;
}
}
else if (baseType == TYP_FLOAT)
{
if (!compExactlyDependsOn(InstructionSet_SSE))
{
break;
}
}
else if (!compExactlyDependsOn(InstructionSet_SSE2))
{
break;
}

if (sig->numArgs == 1)
{
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
else if (sig->numArgs == 2)
{
op2 = impPopStack().val;
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
}
else
{
assert(sig->numArgs >= 3);

GenTreeArgList* tmp = nullptr;

for (unsigned i = 0; i < sig->numArgs; i++)
{
tmp = gtNewArgList(impPopStack().val);
tmp->gtOp2 = op1;
op1 = tmp;
}

retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
break;
}

case NI_Vector128_CreateScalarUnsafe:
{
assert(sig->numArgs == 1);
Expand Down
5 changes: 4 additions & 1 deletion src/coreclr/src/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)

if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0))
{
result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName);
CORINFO_SIG_INFO sig;
info.compCompHnd->getMethodSig(method, &sig);

result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName);
}
else if (strcmp(methodName, "get_IsSupported") == 0)
{
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/src/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ class Lowering final : public Phase
#ifdef FEATURE_HW_INTRINSICS
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS

Expand Down
Loading

0 comments on commit 4d16376

Please sign in to comment.