From 89f6bfe08ff4402bab9c313f6c7d823e30e36ec4 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 15 Jul 2024 15:28:35 -0700 Subject: [PATCH 1/3] Ensure that TernaryLogic lowering accounts for AND_NOT since it is not commutative --- src/coreclr/jit/lowerxarch.cpp | 95 ++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6bf852a49fb38..f3aab413cfbf8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1487,24 +1487,55 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) BlockRange().Remove(node); op3 = userIntrin->Op(2); + // Tracks which two operands get used first + TernaryLogicUseFlags firstOpUseFlags = TernaryLogicUseFlags::AB; + if (op3 == node) { - op3 = userIntrin->Op(1); + if (userOper == GT_AND_NOT) + { + op3 = op2; + op2 = op1; + op1 = userIntrin->Op(1); + + // AND_NOT isn't commutative so we need to shift parameters down + firstOpUseFlags = TernaryLogicUseFlags::BC; + } + else + { + op3 = userIntrin->Op(1); + } } uint8_t controlByte = 0x00; if ((userOper == GT_XOR) && op3->IsVectorAllBitsSet()) { - // We're being used by what is actually GT_NOT, so we - // need to shift parameters down so that A is unused + // We have XOR(OP(A, B), AllBitsSet) + // A: op1 + // B: op2 + // C: op3 (AllBitsSet) + // + // We want A to be the unused parameter so swap it around + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 + // + // This gives us NOT(OP(B, C)) + + assert(firstOpUseFlags == TernaryLogicUseFlags::AB); std::swap(op2, op3); std::swap(op1, op2); if (isOperNot) { - // We have what is actually a double not, so just return op2 + // We have NOT(XOR(B, AllBitsSet)) + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 (AllBitsSet) + // + // This represents a double not, so so just return op2 // which is the only actual value now that the parameters // were shifted around @@ -1538,20 +1569,64 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) } else if (isOperNot) { - // A is unused, so we just want OP(NOT(B), C) + if (firstOpUseFlags == TernaryLogicUseFlags::AB) + { + // We have OP(XOR(A, AllBitsSet), C) + // A: op1 + // B: op2 (AllBitsSet) + // C: op3 + // + // We want A to be the unused parameter so swap it around + // A: op2 (AllBitsSet) + // B: op1 + // C: op3 + // + // This gives us OP(NOT(B), C) - assert(op2->IsVectorAllBitsSet()); - std::swap(op1, op2); + assert(op2->IsVectorAllBitsSet()); + std::swap(op1, op2); - controlByte = static_cast(~B); - controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + controlByte = static_cast(~B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + else + { + // We have OP(A, XOR(B, AllBitsSet)) + // A: op1 + // B: op2 + // C: op3 (AllBitsSet) + // + // We want A to be the unused parameter so swap it around + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 + // + // This gives us OP(B, NOT(C)) + + assert(firstOpUseFlags == TernaryLogicUseFlags::BC); + + assert(op3->IsVectorAllBitsSet()); + std::swap(op2, op3); + std::swap(op1, op2); + + controlByte = static_cast(~C); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, B, controlByte); + } } - else + else if (firstOpUseFlags == TernaryLogicUseFlags::AB) { // We have OP2(OP1(A, B), C) controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, A, B); controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); } + else + { + // We have OP2(A, OP1(B, C)) + assert(firstOpUseFlags == TernaryLogicUseFlags::BC); + + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, B, C); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, A, controlByte); + } NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic; From cfbf2ae091c3b88bc1bdb796c53f37753023d8a2 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 15 Jul 2024 17:30:09 -0700 Subject: [PATCH 2/3] Ensure we create a valid node when DOTNET_EnableSSE2=0 is set --- src/coreclr/jit/hwintrinsicxarch.cpp | 19 +++++++++++++++++-- src/coreclr/jit/lowerxarch.cpp | 20 +++++++++++++++++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 348291ece8ce6..424551b51f3e1 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1413,8 +1413,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); - retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + if (IsBaselineSimdIsaSupported()) + { + op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + } + else + { + // We need to ensure we import even if SSE2 is disabled + assert(intrinsic == NI_SSE_AndNot); + + op3 = gtNewAllBitsSetConNode(retType); + + op1 = gtNewSimdHWIntrinsicNode(retType, op1, op3, NI_SSE_Xor, simdBaseJitType, simdSize); + op1 = gtFoldExpr(op1); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, NI_SSE_And, simdBaseJitType, simdSize); + } break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f3aab413cfbf8..3c4b257a06daa 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1471,9 +1471,23 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) op2 = userIntrin->Op(1); } - NamedIntrinsic intrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, simdBaseType, - simdSize, false); + NamedIntrinsic intrinsic = NI_Illegal; + + if (comp->IsBaselineSimdIsaSupported()) + { + intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, + simdBaseType, simdSize, false); + } + else + { + // We need to ensure we optimize even if SSE2 is disabled + + assert(simdBaseType == TYP_FLOAT); + assert(simdSize <= 16); + + intrinsic = NI_SSE_AndNot; + } + userIntrin->ResetHWIntrinsicId(intrinsic, comp, op1, op2); return nextNode; From dbfd3ab4c75444cd275956faaf102ae96b7f8574 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 16 Jul 2024 08:21:50 -0700 Subject: [PATCH 3/3] Apply formatting patch --- src/coreclr/jit/hwintrinsicxarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 424551b51f3e1..973c7821fa227 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1423,10 +1423,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // We need to ensure we import even if SSE2 is disabled assert(intrinsic == NI_SSE_AndNot); - op3 = gtNewAllBitsSetConNode(retType); + op3 = gtNewAllBitsSetConNode(retType); - op1 = gtNewSimdHWIntrinsicNode(retType, op1, op3, NI_SSE_Xor, simdBaseJitType, simdSize); - op1 = gtFoldExpr(op1); + op1 = gtNewSimdHWIntrinsicNode(retType, op1, op3, NI_SSE_Xor, simdBaseJitType, simdSize); + op1 = gtFoldExpr(op1); retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, NI_SSE_And, simdBaseJitType, simdSize); }