-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ensure Vector256.Dot produces a V256 result #88712
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4462,29 +4462,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) | |
assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); | ||
|
||
// We will be constructing the following parts: | ||
// idx = CNS_INT int 0xF1 | ||
// idx = CNS_INT int 0xFF | ||
// /--* op1 simd16 | ||
// +--* op2 simd16 | ||
// +--* idx int | ||
// tmp1 = * HWINTRINSIC simd32 T DotProduct | ||
// /--* tmp1 simd32 | ||
// * STORE_LCL_VAR simd32 | ||
// tmp1 = LCL_VAR simd32 | ||
// /--* tmp1 simd32 | ||
// tmp1 = * HWINTRINSIC simd16 T GetLower | ||
// tmp2 = LCL_VAR simd32 | ||
// /--* tmp2 simd16 | ||
// tmp2 = * HWINTRINSIC simd16 T GetUpper | ||
// /--* tmp1 simd16 | ||
// +--* tmp2 simd16 | ||
// node = * HWINTRINSIC simd16 T Add | ||
// tmp3 = LCL_VAR simd32 | ||
// /--* tmp2 simd32 | ||
// +--* tmp3 simd32 | ||
// +--* CNS_INT int 0x01 | ||
// tmp2 = * HWINTRINSIC simd32 T Permute | ||
// /--* tmp1 simd32 | ||
// +--* tmp2 simd32 | ||
// node = * HWINTRINSIC simd32 T Add | ||
|
||
// This is roughly the following managed code: | ||
// var tmp1 = Avx.DotProduct(op1, op2, 0xFF); | ||
// var tmp2 = tmp1.GetUpper(); | ||
// return Sse.Add(tmp1, tmp2); | ||
// var tmp2 = Avx.Permute2x128(tmp1, tmp1, 0x4E); | ||
// return Avx.Add(tmp1, tmp2); | ||
|
||
idx = comp->gtNewIconNode(0xF1, TYP_INT); | ||
idx = comp->gtNewIconNode(0xFF, TYP_INT); | ||
BlockRange().InsertBefore(node, idx); | ||
|
||
tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, simdBaseJitType, | ||
|
@@ -4500,27 +4501,30 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) | |
tmp2 = comp->gtClone(tmp1); | ||
BlockRange().InsertAfter(tmp1, tmp2); | ||
|
||
tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, simdSize); | ||
tmp3 = comp->gtClone(tmp2); | ||
BlockRange().InsertAfter(tmp2, tmp3); | ||
LowerNode(tmp3); | ||
|
||
tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize); | ||
BlockRange().InsertAfter(tmp3, tmp1); | ||
LowerNode(tmp1); | ||
idx = comp->gtNewIconNode(0x01, TYP_INT); | ||
BlockRange().InsertAfter(tmp3, idx); | ||
|
||
tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16); | ||
BlockRange().InsertAfter(tmp1, tmp2); | ||
tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, NI_AVX_Permute2x128, simdBaseJitType, | ||
simdSize); | ||
BlockRange().InsertAfter(idx, tmp2); | ||
LowerNode(tmp2); | ||
|
||
tmp1 = comp->gtNewSimdBinOpNode(GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize); | ||
BlockRange().InsertAfter(tmp2, tmp1); | ||
|
||
// We're producing a vector result, so just return the result directly | ||
LIR::Use use; | ||
|
||
if (BlockRange().TryGetUse(node, &use)) | ||
{ | ||
use.ReplaceWith(tmp2); | ||
use.ReplaceWith(tmp1); | ||
} | ||
|
||
BlockRange().Remove(node); | ||
return LowerNode(tmp2); | ||
return LowerNode(tmp1); | ||
} | ||
|
||
case TYP_DOUBLE: | ||
|
@@ -4999,21 +5003,19 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) | |
// /--* tmp1 simd32 | ||
// * STORE_LCL_VAR simd32 | ||
// tmp1 = LCL_VAR simd32 | ||
// /--* tmp1 simd32 | ||
// tmp1 = * HWINTRINSIC simd16 T GetLower | ||
// tmp2 = LCL_VAR simd32 | ||
// /--* tmp2 simd32 | ||
// tmp3 = * HWINTRINSIC simd16 T GetUpper | ||
// /--* tmp1 simd16 | ||
// +--* tmp3 simd16 | ||
// tmp1 = * HWINTRINSIC simd16 T Add | ||
// +--* CNS_INT int 0x01 | ||
// tmp2 = * HWINTRINSIC simd32 float Permute | ||
// /--* tmp1 simd32 | ||
// +--* tmp2 simd32 | ||
// tmp1 = * HWINTRINSIC simd32 T Add | ||
// ... | ||
|
||
// This is roughly the following managed code: | ||
// ... | ||
// var tmp2 = tmp1; | ||
// tmp3 = tmp2.GetUpper(); | ||
// var tmp1 = Isa.Add(tmp1.GetLower(), tmp2); | ||
// var tmp2 = Isa.Permute2x128(tmp1, tmp2, 0x01); | ||
// tmp1 = Isa.Add(tmp1, tmp2); | ||
// ... | ||
|
||
assert(simdBaseType != TYP_FLOAT); | ||
|
@@ -5026,20 +5028,21 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) | |
tmp2 = comp->gtClone(tmp1); | ||
BlockRange().InsertAfter(tmp1, tmp2); | ||
|
||
tmp3 = comp->gtNewSimdGetUpperNode(TYP_SIMD16, tmp2, simdBaseJitType, simdSize); | ||
tmp3 = comp->gtClone(tmp2); | ||
BlockRange().InsertAfter(tmp2, tmp3); | ||
LowerNode(tmp3); | ||
|
||
tmp1 = comp->gtNewSimdGetLowerNode(TYP_SIMD16, tmp1, simdBaseJitType, simdSize); | ||
BlockRange().InsertAfter(tmp3, tmp1); | ||
LowerNode(tmp1); | ||
idx = comp->gtNewIconNode(0x01, TYP_INT); | ||
BlockRange().InsertAfter(tmp3, idx); | ||
|
||
tmp2 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp3, tmp1, simdBaseJitType, 16); | ||
BlockRange().InsertAfter(tmp1, tmp2); | ||
NamedIntrinsic permute2x128 = (simdBaseType == TYP_DOUBLE) ? NI_AVX_Permute2x128 : NI_AVX2_Permute2x128; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the debug assert only checks AVX: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have an assert a bit higher up already. We could replicate it here if you really want it to be. |
||
|
||
tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, permute2x128, simdBaseJitType, simdSize); | ||
BlockRange().InsertAfter(idx, tmp2); | ||
LowerNode(tmp2); | ||
|
||
node->SetSimdSize(16); | ||
tmp1 = tmp2; | ||
tmp1 = comp->gtNewSimdBinOpNode(GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize); | ||
BlockRange().InsertAfter(tmp2, tmp1); | ||
LowerNode(tmp1); | ||
} | ||
|
||
// We're producing a vector result, so just return the result directly | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The issue was simply that we were returning a
V128
still when we should've been returning a fullV256
The old code was basically doing:
The new code instead does:
So we have the same general size of code and are simply swapping lower/upper rather than extracting upper, we then do a 256-bit add rather than a 128-bit add.
In a couple edge cases, we do end up with slightly better code due to how the register allocator handles the uses of the clones and so we see a very small -51 bytes disasm improvement.