FEXCore: Add non-atomic Memcpy and Memset IR fast paths
When TSO is disabled, vector LDP/STP can be used for a two-instruction
32-byte memory copy, which is significantly faster than the current
byte-by-byte copy. Performing two such copies directly after one another
also marginally increases copy speed for all sizes >= 64.
bylaws committed Mar 5, 2024
1 parent 009ae55 commit 4459975
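
As a rough illustration of the strategy described in the commit message, the emitted control flow corresponds to the following C++ sketch (an analogue only, not code from this commit: the function name is made up, plain memcpy calls stand in for the vector LDP/STP pairs, the count is simplified to byte granularity, and only the forward-copy direction is shown):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical analogue of the non-atomic fast path: 64 bytes per main-loop
// iteration (two 32-byte chunks back to back), then a 32-byte step, then a
// byte-by-byte tail matching the pre-existing slow path.
static void CopyForwardSketch(uint8_t* Dst, const uint8_t* Src, size_t Bytes) {
  while (Bytes >= 64) {
    std::memcpy(Dst, Src, 32);            // vector LDP/STP pair in the JIT
    std::memcpy(Dst + 32, Src + 32, 32);  // second pair, issued back to back
    Dst += 64;
    Src += 64;
    Bytes -= 64;
  }
  while (Bytes >= 32) {                   // at most one iteration after the loop above
    std::memcpy(Dst, Src, 32);
    Dst += 32;
    Src += 32;
    Bytes -= 32;
  }
  while (Bytes--) {                       // byte-by-byte tail
    *Dst++ = *Src++;
  }
}

The real code only takes this path when the operation is non-atomic (TSO disabled) and the pointers involved are 4-byte aligned, and it tracks the count in rep units rather than bytes.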
104 changes: 100 additions & 4 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
@@ -1784,16 +1784,66 @@ DEF_OP(MemSet) {
}
};

const auto SubRegSize =
Size == 1 ? ARMEmitter::SubRegSize::i8Bit :
Size == 2 ? ARMEmitter::SubRegSize::i16Bit :
Size == 4 ? ARMEmitter::SubRegSize::i32Bit :
Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit;

auto EmitMemset = [&](int32_t Direction) {
const int32_t OpSize = Size;
const int32_t SizeDirection = Size * Direction;

- ARMEmitter::BackwardLabel AgainInternal{};
- ARMEmitter::SingleUseForwardLabel DoneInternal{};
+ ARMEmitter::BiDirectionalLabel AgainInternal{};
+ ARMEmitter::ForwardLabel DoneInternal{};

// Early exit if zero count.
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

if (!Op->IsAtomic) {
ARMEmitter::ForwardLabel AgainInternal256Exit{};
ARMEmitter::BackwardLabel AgainInternal256{};
ARMEmitter::ForwardLabel AgainInternal128Exit{};
ARMEmitter::BackwardLabel AgainInternal128{};

// Fallback to byte by byte loop if not 4 byte aligned
and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3);
cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);

// Fill VTMP2 with the set pattern
dup(SubRegSize, VTMP2.Q(), Value);

// Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
// to the copy unit size copy loop for the last chunk.
// Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the
// single copy loop if size < 64.
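// For example, with Size == 1 and a count of 100: 100-32 = 68 and 68-32 = 36, so the 64-byte
// loop below runs once (36-64 underflows); adding 64 back gives 36, then 36-32 = 4 allows a
// single 32-byte store (4-32 underflows again); adding 32 back leaves 4 bytes for the
// byte-by-byte loop at AgainInternal.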
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal256Exit);

Bind(&AgainInternal256);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
tbz(TMP1, 63, &AgainInternal256);

Bind(&AgainInternal256Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
Bind(&AgainInternal128);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbz(TMP1, 63, &AgainInternal128);

Bind(&AgainInternal128Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
}

Bind(&AgainInternal);
if (Op->IsAtomic) {
MemStoreTSO(Value, OpSize, SizeDirection);
@@ -1943,6 +1993,10 @@ DEF_OP(MemCpy) {
ldr<ARMEmitter::IndexType::POST>(TMP4, TMP3, Size);
str<ARMEmitter::IndexType::POST>(TMP4, TMP2, Size);
break;
case 32:
ldp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP3, Size);
stp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP2, Size);
break;
default:
LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size);
break;
@@ -2049,11 +2103,53 @@ DEF_OP(MemCpy) {
const int32_t OpSize = Size;
const int32_t SizeDirection = Size * Direction;

- ARMEmitter::BackwardLabel AgainInternal{};
- ARMEmitter::SingleUseForwardLabel DoneInternal{};
+ ARMEmitter::BiDirectionalLabel AgainInternal{};
+ ARMEmitter::ForwardLabel DoneInternal{};

// Early exit if zero count.
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3);

if (!Op->IsAtomic) {
ARMEmitter::ForwardLabel AgainInternal256Exit{};
ARMEmitter::ForwardLabel AgainInternal128Exit{};
ARMEmitter::BackwardLabel AgainInternal128{};
ARMEmitter::BackwardLabel AgainInternal256{};

// Fallback to byte by byte loop if either of start/end are not 4 byte aligned
and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3);
cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);

// Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
// to the copy unit size copy loop for the last chunk.
// Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the
// single copy loop if size < 64.
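// For example, with Size == 4 and 40 elements (160 bytes): 40-8 = 32 and 32-8 = 24, so the
// 64-byte loop below runs twice (24-16 = 8, then 8-16 underflows); adding 16 back gives 8,
// then 8-8 = 0 lets the 32-byte loop copy the final 32 bytes and the counter hits zero at
// AgainInternal128Exit.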
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal256Exit);

Bind(&AgainInternal256);
MemCpy(32, 32 * Direction);
MemCpy(32, 32 * Direction);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
tbz(TMP1, 63, &AgainInternal256);

Bind(&AgainInternal256Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
Bind(&AgainInternal128);
MemCpy(32, 32 * Direction);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbz(TMP1, 63, &AgainInternal128);

Bind(&AgainInternal128Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
}

Bind(&AgainInternal);
if (Op->IsAtomic) {
