From 4459975194596c534faa165ee3a44ca840d04728 Mon Sep 17 00:00:00 2001
From: Billy Laws
Date: Thu, 29 Feb 2024 19:20:21 +0000
Subject: [PATCH] FEXCore: Add non-atomic Memcpy and Memset IR fast paths

When TSO is disabled, vector LDP/STP can be used for a two-instruction,
32-byte memory copy, which is significantly faster than the current
byte-by-byte copy. Performing two such copies directly after one another
also marginally increases copy speed for all sizes >= 64.
---
 .../Interface/Core/JIT/Arm64/MemoryOps.cpp | 104 +++++++++++++++++-
 1 file changed, 100 insertions(+), 4 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
index 40472ac20a..9f3e62696f 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
@@ -1784,16 +1784,66 @@ DEF_OP(MemSet) {
     }
   };
 
+  const auto SubRegSize =
+    Size == 1 ? ARMEmitter::SubRegSize::i8Bit :
+    Size == 2 ? ARMEmitter::SubRegSize::i16Bit :
+    Size == 4 ? ARMEmitter::SubRegSize::i32Bit :
+    Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit;
+
   auto EmitMemset = [&](int32_t Direction) {
     const int32_t OpSize = Size;
     const int32_t SizeDirection = Size * Direction;
-    ARMEmitter::BackwardLabel AgainInternal{};
-    ARMEmitter::SingleUseForwardLabel DoneInternal{};
+    ARMEmitter::BiDirectionalLabel AgainInternal{};
+    ARMEmitter::ForwardLabel DoneInternal{};
 
     // Early exit if zero count.
     cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
 
+    if (!Op->IsAtomic) {
+      ARMEmitter::ForwardLabel AgainInternal256Exit{};
+      ARMEmitter::BackwardLabel AgainInternal256{};
+      ARMEmitter::ForwardLabel AgainInternal128Exit{};
+      ARMEmitter::BackwardLabel AgainInternal128{};
+
+      // Fallback to byte by byte loop if not 4 byte aligned
+      and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3);
+      cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);
+
+      // Fill VTMP2 with the set pattern
+      dup(SubRegSize, VTMP2.Q(), Value);
+
+      // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
+      // to the copy unit size copy loop for the last chunk.
+      // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the
+      // single copy loop if size < 64.
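+      // For example, with Size == 1 and a count of 100 the two subs below leave 36 (100 - 64);
+      // one iteration of the 64 byte loop drops the counter to -28, so it exits and the add
+      // restores 36; the single copy loop then stores one more 32 byte chunk (36 -> 4) and the
+      // final 4 bytes fall through to the byte by byte loop below.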
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal256Exit);
+
+      Bind(&AgainInternal256);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      tbz(TMP1, 63, &AgainInternal256);
+
+      Bind(&AgainInternal256Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      Bind(&AgainInternal128);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbz(TMP1, 63, &AgainInternal128);
+
+      Bind(&AgainInternal128Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    }
+
     Bind(&AgainInternal);
     if (Op->IsAtomic) {
       MemStoreTSO(Value, OpSize, SizeDirection);
@@ -1943,6 +1993,10 @@ DEF_OP(MemCpy) {
         ldr(TMP4, TMP3, Size);
         str(TMP4, TMP2, Size);
         break;
+      case 32:
+        ldp(VTMP1.Q(), VTMP2.Q(), TMP3, Size);
+        stp(VTMP1.Q(), VTMP2.Q(), TMP2, Size);
+        break;
       default:
         LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size);
         break;
@@ -2049,11 +2103,53 @@ DEF_OP(MemCpy) {
     const int32_t OpSize = Size;
     const int32_t SizeDirection = Size * Direction;
-    ARMEmitter::BackwardLabel AgainInternal{};
-    ARMEmitter::SingleUseForwardLabel DoneInternal{};
+    ARMEmitter::BiDirectionalLabel AgainInternal{};
+    ARMEmitter::ForwardLabel DoneInternal{};
 
     // Early exit if zero count.
     cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3);
+
+    if (!Op->IsAtomic) {
+      ARMEmitter::ForwardLabel AgainInternal256Exit{};
+      ARMEmitter::ForwardLabel AgainInternal128Exit{};
+      ARMEmitter::BackwardLabel AgainInternal128{};
+      ARMEmitter::BackwardLabel AgainInternal256{};
+
+      // Fallback to byte by byte loop if either of start/end are not 4 byte aligned
+      and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3);
+      cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);
+
+      // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
+      // to the copy unit size copy loop for the last chunk.
+      // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the
+      // single copy loop if size < 64.
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal256Exit);
+
+      Bind(&AgainInternal256);
+      MemCpy(32, 32 * Direction);
+      MemCpy(32, 32 * Direction);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      tbz(TMP1, 63, &AgainInternal256);
+
+      Bind(&AgainInternal256Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      Bind(&AgainInternal128);
+      MemCpy(32, 32 * Direction);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbz(TMP1, 63, &AgainInternal128);
+
+      Bind(&AgainInternal128Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    }
 
     Bind(&AgainInternal);
     if (Op->IsAtomic) {
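
Note: the code below is a minimal host-side C++ sketch, not FEXCore code, showing the control
flow the new non-TSO MemSet fast path emits for the forward direction with byte-sized elements.
It only illustrates the "keep the counter one copy ahead" underflow trick; the function and
variable names are invented for the sketch, and the real emitter generalizes the 32 / Size
arithmetic over element sizes of 1/2/4/8 bytes and also covers the backward direction and the
equivalent MemCpy loop.

#include <cstdint>
#include <cstring>

// Illustrative stand-in for the emitted fast path; names are invented.
void SketchForwardMemset(uint8_t* Dst, uint8_t Value, int64_t Remaining) {
  // Vector path only when the destination is 4-byte aligned, mirroring the
  // and_/cbnz alignment check in the patch.
  if ((reinterpret_cast<uintptr_t>(Dst) & 3) == 0) {
    Remaining -= 32;               // reserve one 32-byte store
    if (Remaining >= 0) {
      Remaining -= 32;             // reserve a second 32-byte store
      if (Remaining >= 0) {
        do {                       // AgainInternal256: two 32-byte STPs per iteration
          std::memset(Dst, Value, 64);
          Dst += 64;
          Remaining -= 64;
        } while (Remaining >= 0);  // tbz on bit 63: loop until the counter underflows
      }
      Remaining += 64;             // AgainInternal256Exit: undo the reservation
      if (Remaining == 0) {
        return;
      }
      Remaining -= 32;
      while (Remaining >= 0) {     // AgainInternal128: single 32-byte STP
        std::memset(Dst, Value, 32);
        Dst += 32;
        Remaining -= 32;
      }
      Remaining += 32;             // AgainInternal128Exit: undo the reservation
      if (Remaining == 0) {
        return;
      }
    } else {
      Remaining += 32;             // fewer than 32 bytes in total
    }
  }

  // Byte-by-byte tail, corresponding to the pre-existing AgainInternal loop.
  while (Remaining-- > 0) {
    *Dst++ = Value;
  }
}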