diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 348291ece8ce6..973c7821fa227 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1413,8 +1413,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); - retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + if (IsBaselineSimdIsaSupported()) + { + op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize)); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + } + else + { + // We need to ensure we import even if SSE2 is disabled + assert(intrinsic == NI_SSE_AndNot); + + op3 = gtNewAllBitsSetConNode(retType); + + op1 = gtNewSimdHWIntrinsicNode(retType, op1, op3, NI_SSE_Xor, simdBaseJitType, simdSize); + op1 = gtFoldExpr(op1); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, NI_SSE_And, simdBaseJitType, simdSize); + } break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6bf852a49fb38..3c4b257a06daa 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1471,9 +1471,23 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) op2 = userIntrin->Op(1); } - NamedIntrinsic intrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, simdBaseType, - simdSize, false); + NamedIntrinsic intrinsic = NI_Illegal; + + if (comp->IsBaselineSimdIsaSupported()) + { + intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_AND_NOT, op1, op2, + simdBaseType, simdSize, false); + } + else + { + // We need to ensure we optimize even if SSE2 is disabled + + assert(simdBaseType == TYP_FLOAT); + assert(simdSize <= 16); + + intrinsic = NI_SSE_AndNot; + } + userIntrin->ResetHWIntrinsicId(intrinsic, comp, op1, op2); return nextNode; @@ -1487,24 +1501,55 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) BlockRange().Remove(node); op3 = userIntrin->Op(2); + // Tracks which two operands get used first + TernaryLogicUseFlags firstOpUseFlags = TernaryLogicUseFlags::AB; + if (op3 == node) { - op3 = userIntrin->Op(1); + if (userOper == GT_AND_NOT) + { + op3 = op2; + op2 = op1; + op1 = userIntrin->Op(1); + + // AND_NOT isn't commutative so we need to shift parameters down + firstOpUseFlags = TernaryLogicUseFlags::BC; + } + else + { + op3 = userIntrin->Op(1); + } } uint8_t controlByte = 0x00; if ((userOper == GT_XOR) && op3->IsVectorAllBitsSet()) { - // We're being used by what is actually GT_NOT, so we - // need to shift parameters down so that A is unused + // We have XOR(OP(A, B), AllBitsSet) + // A: op1 + // B: op2 + // C: op3 (AllBitsSet) + // + // We want A to be the unused parameter so swap it around + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 + // + // This gives us NOT(OP(B, C)) + + assert(firstOpUseFlags == TernaryLogicUseFlags::AB); std::swap(op2, op3); std::swap(op1, op2); if (isOperNot) { - // We have what is actually a double not, so just return op2 + // We have NOT(XOR(B, AllBitsSet)) + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 (AllBitsSet) + // + // This represents a double not, so so just return op2 // which is the only actual value now that the parameters // were shifted around @@ -1538,20 +1583,64 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) } else if (isOperNot) { - // A is unused, so we just want OP(NOT(B), C) + if (firstOpUseFlags == TernaryLogicUseFlags::AB) + { + // We have OP(XOR(A, AllBitsSet), C) + // A: op1 + // B: op2 (AllBitsSet) + // C: op3 + // + // We want A to be the unused parameter so swap it around + // A: op2 (AllBitsSet) + // B: op1 + // C: op3 + // + // This gives us OP(NOT(B), C) - assert(op2->IsVectorAllBitsSet()); - std::swap(op1, op2); + assert(op2->IsVectorAllBitsSet()); + std::swap(op1, op2); - controlByte = static_cast(~B); - controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + controlByte = static_cast(~B); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); + } + else + { + // We have OP(A, XOR(B, AllBitsSet)) + // A: op1 + // B: op2 + // C: op3 (AllBitsSet) + // + // We want A to be the unused parameter so swap it around + // A: op3 (AllBitsSet) + // B: op1 + // C: op2 + // + // This gives us OP(B, NOT(C)) + + assert(firstOpUseFlags == TernaryLogicUseFlags::BC); + + assert(op3->IsVectorAllBitsSet()); + std::swap(op2, op3); + std::swap(op1, op2); + + controlByte = static_cast(~C); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, B, controlByte); + } } - else + else if (firstOpUseFlags == TernaryLogicUseFlags::AB) { // We have OP2(OP1(A, B), C) controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, A, B); controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, controlByte, C); } + else + { + // We have OP2(A, OP1(B, C)) + assert(firstOpUseFlags == TernaryLogicUseFlags::BC); + + controlByte = TernaryLogicInfo::GetTernaryControlByte(oper, B, C); + controlByte = TernaryLogicInfo::GetTernaryControlByte(userOper, A, controlByte); + } NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic;