From 61a05ee2b4483d1475bda43e7f9f1395b53863a3 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 27 Aug 2023 12:56:53 +0200 Subject: [PATCH 1/2] Use AVX512 to zero locals --- src/coreclr/jit/codegenxarch.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5847b40ffd32e..42e037c8a3bf5 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -10894,9 +10894,12 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu assert((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo)); #endif // !defined(TARGET_AMD64) + const int maxSimdSize = (int)compiler->roundDownSIMDSize(blkSize); + assert((maxSimdSize >= XMM_REGSIZE_BYTES) && (maxSimdSize <= ZMM_REGSIZE_BYTES)); + // The loop is unrolled 3 times so we do not move to the loop block until it // will loop at least once so the threshold is 6. - if (blkSize < (6 * XMM_REGSIZE_BYTES)) + if (blkSize < (6 * maxSimdSize)) { // Generate the following code: // @@ -10905,10 +10908,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu // ... // movups xmmword ptr [ebp/esp-OFFS], xmm4 // mov qword ptr [ebp/esp-OFFS], rax - + // + // NOTE: it implicitly zeroes YMM4 and ZMM4 as well. emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, zeroSIMDReg, zeroSIMDReg, zeroSIMDReg); int i = 0; + if (maxSimdSize > XMM_REGSIZE_BYTES) + { + for (; i < blkSize - maxSimdSize; i += maxSimdSize) + { + // We previously aligned data to 16 bytes which might not be aligned to maxSimdSize + emit->emitIns_AR_R(simdUnalignedMovIns(), EA_ATTR(maxSimdSize), zeroSIMDReg, frameReg, + alignedLclLo + i); + } + } + for (; i < blkSize; i += XMM_REGSIZE_BYTES) { emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i); From 86754831cacaa351af45b4c0c76824adfc860b09 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 27 Aug 2023 15:24:11 +0200 Subject: [PATCH 2/2] Fix loop --- src/coreclr/jit/codegenxarch.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 42e037c8a3bf5..2cb7b2d094155 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -10915,12 +10915,13 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu int i = 0; if (maxSimdSize > XMM_REGSIZE_BYTES) { - for (; i < blkSize - maxSimdSize; i += maxSimdSize) + for (; i <= blkSize - maxSimdSize; i += maxSimdSize) { // We previously aligned data to 16 bytes which might not be aligned to maxSimdSize emit->emitIns_AR_R(simdUnalignedMovIns(), EA_ATTR(maxSimdSize), zeroSIMDReg, frameReg, alignedLclLo + i); } + // Remainder will be handled by the xmm loop below } for (; i < blkSize; i += XMM_REGSIZE_BYTES)