From 8d4d8fe3e5baaba6f728352520201108725680ed Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 29 Feb 2024 19:20:21 +0000 Subject: [PATCH 1/2] FEXCore: Add non-atomic Memcpy and Memset IR fast paths When TSO is disabled, vector LDP/STP can be used for a two instruction 32 byte memory copy which is significantly faster than the current byte-by-byte copy. Performing two such copies directly after one another also marginally increases copy speed for all sizes >=64. --- .../Interface/Core/JIT/Arm64/MemoryOps.cpp | 123 +++++++++++++++++- 1 file changed, 119 insertions(+), 4 deletions(-) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp index 8bfa108ddc..207646b044 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp @@ -1792,16 +1792,75 @@ DEF_OP(MemSet) { } }; + const auto SubRegSize = + Size == 1 ? ARMEmitter::SubRegSize::i8Bit : + Size == 2 ? ARMEmitter::SubRegSize::i16Bit : + Size == 4 ? ARMEmitter::SubRegSize::i32Bit : + Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit; + auto EmitMemset = [&](int32_t Direction) { const int32_t OpSize = Size; const int32_t SizeDirection = Size * Direction; - ARMEmitter::BackwardLabel AgainInternal{}; - ARMEmitter::SingleUseForwardLabel DoneInternal{}; + ARMEmitter::BiDirectionalLabel AgainInternal{}; + ARMEmitter::ForwardLabel DoneInternal{}; // Early exit if zero count. 
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + if (!Op->IsAtomic) { + ARMEmitter::ForwardLabel AgainInternal256Exit{}; + ARMEmitter::BackwardLabel AgainInternal256{}; + ARMEmitter::ForwardLabel AgainInternal128Exit{}; + ARMEmitter::BackwardLabel AgainInternal128{}; + + // Fallback to byte by byte loop if not 4 byte aligned + and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3); + cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal); + + if (Direction == -1) { + sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); + } + + // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback + // to the copy unit size copy loop for the last chunk. + // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the + // single copy loop if size < 64. + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal128Exit); + + // Fill VTMP2 with the set pattern + dup(SubRegSize, VTMP2.Q(), Value); + + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal256Exit); + + Bind(&AgainInternal256); + stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); + stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); + tbz(TMP1, 63, &AgainInternal256); + + Bind(&AgainInternal256Exit); + add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); + cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal128Exit); + Bind(&AgainInternal128); + stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbz(TMP1, 63, &AgainInternal128); + + Bind(&AgainInternal128Exit); + add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + + if (Direction == -1) { + add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); + } + } + Bind(&AgainInternal); if (Op->IsAtomic) { 
MemStoreTSO(Value, OpSize, SizeDirection); @@ -1951,6 +2010,10 @@ DEF_OP(MemCpy) { ldr(TMP4, TMP3, Size); str(TMP4, TMP2, Size); break; + case 32: + ldp(VTMP1.Q(), VTMP2.Q(), TMP3, Size); + stp(VTMP1.Q(), VTMP2.Q(), TMP2, Size); + break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; @@ -2057,12 +2120,64 @@ DEF_OP(MemCpy) { const int32_t OpSize = Size; const int32_t SizeDirection = Size * Direction; - ARMEmitter::BackwardLabel AgainInternal{}; - ARMEmitter::SingleUseForwardLabel DoneInternal{}; + ARMEmitter::BiDirectionalLabel AgainInternal{}; + ARMEmitter::ForwardLabel DoneInternal{}; // Early exit if zero count. cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + if (!Op->IsAtomic) { + ARMEmitter::ForwardLabel AgainInternal256Exit{}; + ARMEmitter::ForwardLabel AgainInternal128Exit{}; + ARMEmitter::BackwardLabel AgainInternal128{}; + ARMEmitter::BackwardLabel AgainInternal256{}; + + // Fall back to byte by byte loop if either the destination (TMP2) or source (TMP3) pointer is not 4 byte aligned + orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3); + and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3); + cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal); + + if (Direction == -1) { + sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); + sub(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32 - Size); + } + + // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback + // to the copy unit size copy loop for the last chunk. + // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the + // single copy loop if size < 64. 
+ sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal128Exit); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal256Exit); + + Bind(&AgainInternal256); + MemCpy(32, 32 * Direction); + MemCpy(32, 32 * Direction); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); + tbz(TMP1, 63, &AgainInternal256); + + Bind(&AgainInternal256Exit); + add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); + cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbnz(TMP1, 63, &AgainInternal128Exit); + Bind(&AgainInternal128); + MemCpy(32, 32 * Direction); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + tbz(TMP1, 63, &AgainInternal128); + + Bind(&AgainInternal128Exit); + add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); + cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); + + if (Direction == -1) { + add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); + add(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32 - Size); + } + } + Bind(&AgainInternal); if (Op->IsAtomic) { MemCpyTSO(OpSize, SizeDirection); From 29b05f6b90d3f5f4fd12f430cbfeb80e32c734ee Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Mar 2024 23:30:19 +0000 Subject: [PATCH 2/2] Update InstCountCI --- .../InstructionCountCI/FEXOpt/MultiInst.json | 416 ++++++++++++++++- unittests/InstructionCountCI/Primary.json | 432 ++++++++++++++++-- 2 files changed, 792 insertions(+), 56 deletions(-) diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst.json b/unittests/InstructionCountCI/FEXOpt/MultiInst.json index a3b790c588..bcf564da92 100644 --- a/unittests/InstructionCountCI/FEXOpt/MultiInst.json +++ b/unittests/InstructionCountCI/FEXOpt/MultiInst.json @@ -216,7 +216,7 @@ ] }, "positive rep movsb": { - "ExpectedInstructionCount": 19, + "ExpectedInstructionCount": 42, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to 
post-increment when known" @@ -232,6 +232,29 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x34", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #1", "strb w3, [x1], #1", @@ -248,7 +271,7 @@ ] }, "positive rep movsw": { - "ExpectedInstructionCount": 19, + "ExpectedInstructionCount": 42, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -264,6 +287,29 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x34", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #2", "strh w3, [x1], #2", @@ -280,7 +326,7 @@ ] }, "positive rep movsd": { - "ExpectedInstructionCount": 19, + "ExpectedInstructionCount": 42, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -296,6 +342,29 @@ "mov x0, x5", 
"mov x1, x11", "mov x2, x10", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x34", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #4", "str w3, [x1], #4", @@ -312,7 +381,7 @@ ] }, "positive rep movsq": { - "ExpectedInstructionCount": 19, + "ExpectedInstructionCount": 42, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -328,6 +397,29 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x34", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #8", "str x3, [x1], #8", @@ -344,7 +436,7 @@ ] }, "negative rep movsb": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 45, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -359,7 +451,34 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x14", + "cbz x0, #+0x80", + "orr x3, x1, x2", 
+ "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1f (31)", + "sub x2, x2, #0x1f (31)", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1f (31)", + "add x2, x2, #0x1f (31)", "ldrb w3, [x2], #-1", "strb w3, [x1], #-1", "sub x0, x0, #0x1 (1)", @@ -375,7 +494,7 @@ ] }, "negative rep movsw": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 45, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -390,7 +509,34 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x14", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1e (30)", + "sub x2, x2, #0x1e (30)", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1e (30)", + "add x2, x2, #0x1e (30)", "ldrh w3, [x2], #-2", "strh w3, [x1], #-2", "sub x0, x0, #0x1 (1)", @@ -406,7 +552,7 @@ ] }, "negative rep movsd": { - "ExpectedInstructionCount": 18, + 
"ExpectedInstructionCount": 45, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -421,7 +567,34 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x14", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1c (28)", + "sub x2, x2, #0x1c (28)", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1c (28)", + "add x2, x2, #0x1c (28)", "ldr w3, [x2], #-4", "str w3, [x1], #-4", "sub x0, x0, #0x1 (1)", @@ -437,7 +610,7 @@ ] }, "negative rep movsq": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 45, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -452,7 +625,34 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x14", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x18 (24)", + "sub x2, x2, #0x18 (24)", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0xc", + "add x0, 
x0, #0x4 (4)", + "cbz x0, #+0x1c", + "add x1, x1, #0x18 (24)", + "add x2, x2, #0x18 (24)", "ldr x3, [x2], #-8", "str x3, [x1], #-8", "sub x0, x0, #0x1 (1)", @@ -468,7 +668,7 @@ ] }, "positive rep stosb": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 32, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -484,6 +684,26 @@ "uxtb w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x3c", + "dup v1.16b, w21", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w21, [x1], #1", "sub x0, x0, #0x1 (1)", @@ -493,7 +713,7 @@ ] }, "positive rep stosw": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 32, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -509,6 +729,26 @@ "uxth w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x3c", + "dup v1.8h, w21", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w21, [x1], #2", "sub x0, x0, #0x1 (1)", @@ -518,7 +758,7 @@ ] }, "positive rep stosd": { - 
"ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 32, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -534,6 +774,26 @@ "mov w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x3c", + "dup v1.4s, w21", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w21, [x1], #4", "sub x0, x0, #0x1 (1)", @@ -543,7 +803,7 @@ ] }, "positive rep stosq": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 31, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -558,6 +818,26 @@ "strb w21, [x28, #714]", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #8", "sub x0, x0, #0x1 (1)", @@ -567,7 +847,7 @@ ] }, "negative rep stosb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 33, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -582,7 +862,29 @@ "uxtb 
w20, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x10", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1f (31)", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x14", + "add x1, x1, #0x1f (31)", "strb w20, [x1], #-1", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", @@ -591,7 +893,7 @@ ] }, "negative rep stosw": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 33, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -606,7 +908,29 @@ "uxth w20, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x10", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1e (30)", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x14", + "add x1, x1, #0x1e (30)", "strh w20, [x1], #-2", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", @@ -615,7 +939,7 @@ ] }, "negative rep stosd": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 33, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -630,7 +954,29 @@ "mov w20, w4", "mov 
x0, x5", "mov x1, x11", - "cbz x0, #+0x10", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1c (28)", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x14", + "add x1, x1, #0x1c (28)", "str w20, [x1], #-4", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", @@ -639,7 +985,7 @@ ] }, "negative rep stosq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 32, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -653,7 +999,29 @@ "strb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x10", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x18 (24)", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x4 (4)", + "cbz x0, #+0x14", + "add x1, x1, #0x18 (24)", "str x4, [x1], #-8", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 143ae2af84..8cd7ecdefc 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -2860,14 +2860,37 @@ ] }, "rep movsb": { - "ExpectedInstructionCount": 29, + 
"ExpectedInstructionCount": 79, "Comment": "0xa4", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x30", + "tbnz w20, #1, #+0x8c", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x34", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #1", "strb w3, [x1], #1", @@ -2878,8 +2901,35 @@ "mov x2, x5", "add x20, x0, x2", "add x21, x1, x2", - "b #+0x2c", - "cbz x0, #+0x14", + "b #+0x98", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1f (31)", + "sub x2, x2, #0x1f (31)", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1f (31)", + "add x2, x2, #0x1f (31)", "ldrb w3, [x2], #-1", "strb w3, [x1], #-1", "sub x0, x0, #0x1 (1)", @@ -2895,14 +2945,37 @@ ] }, "rep movsw": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 79, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, 
#+0x30", + "tbnz w20, #1, #+0x8c", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x34", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #2", "strh w3, [x1], #2", @@ -2913,8 +2986,35 @@ "mov x2, x5", "add x20, x0, x2, lsl #1", "add x21, x1, x2, lsl #1", - "b #+0x2c", - "cbz x0, #+0x14", + "b #+0x98", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1e (30)", + "sub x2, x2, #0x1e (30)", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1e (30)", + "add x2, x2, #0x1e (30)", "ldrh w3, [x2], #-2", "strh w3, [x1], #-2", "sub x0, x0, #0x1 (1)", @@ -2930,14 +3030,37 @@ ] }, "rep movsd": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 79, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x30", + "tbnz w20, #1, #+0x8c", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x8 (8)", 
+ "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x34", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #4", "str w3, [x1], #4", @@ -2948,8 +3071,35 @@ "mov x2, x5", "add x20, x0, x2, lsl #2", "add x21, x1, x2, lsl #2", - "b #+0x2c", - "cbz x0, #+0x14", + "b #+0x98", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x1c (28)", + "sub x2, x2, #0x1c (28)", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x1c", + "add x1, x1, #0x1c (28)", + "add x2, x2, #0x1c (28)", "ldr w3, [x2], #-4", "str w3, [x1], #-4", "sub x0, x0, #0x1 (1)", @@ -2965,14 +3115,37 @@ ] }, "rep movsq": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 79, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x30", + "tbnz w20, #1, #+0x8c", + "cbz x0, #+0x70", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x54", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp 
q0, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x34", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #8", "str x3, [x1], #8", @@ -2983,8 +3156,35 @@ "mov x2, x5", "add x20, x0, x2, lsl #3", "add x21, x1, x2, lsl #3", - "b #+0x2c", - "cbz x0, #+0x14", + "b #+0x98", + "cbz x0, #+0x80", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x64", + "sub x1, x1, #0x18 (24)", + "sub x2, x2, #0x18 (24)", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x44", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x1c", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x14", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x3c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x4 (4)", + "cbz x0, #+0x1c", + "add x1, x1, #0x18 (24)", + "add x2, x2, #0x18 (24)", "ldr x3, [x2], #-8", "str x3, [x1], #-8", "sub x0, x0, #0x1 (1)", @@ -3341,21 +3541,63 @@ ] }, "rep stosb": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 59, "Comment": "0xaa", "ExpectedArm64ASM": [ "uxtb w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x1c", + "tbnz w21, #1, #+0x6c", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x20 
(32)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w20, [x1], #1", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5", - "b #+0x18", - "cbz x0, #+0x10", + "b #+0x70", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1f (31)", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x40 (64)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x14", + "add x1, x1, #0x1f (31)", "strb w20, [x1], #-1", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", @@ -3364,21 +3606,63 @@ ] }, "rep stosw": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 59, "Comment": "0xab", "ExpectedArm64ASM": [ "uxth w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x1c", + "tbnz w21, #1, #+0x6c", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w20, [x1], #2", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #1", - "b #+0x18", - "cbz x0, #+0x10", + "b #+0x70", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1e (30)", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x10 (16)", + 
"tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x20 (32)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x14", + "add x1, x1, #0x1e (30)", "strh w20, [x1], #-2", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", @@ -3387,21 +3671,63 @@ ] }, "rep stosd": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 59, "Comment": "0xab", "ExpectedArm64ASM": [ "mov w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x1c", + "tbnz w21, #1, #+0x6c", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w20, [x1], #4", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #2", - "b #+0x18", - "cbz x0, #+0x10", + "b #+0x70", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x1c (28)", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x10 (16)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x14", + "add x1, x1, #0x1c (28)", "str w20, [x1], #-4", "sub x0, x0, #0x1 
(1)", "cbnz x0, #-0x8", @@ -3410,7 +3736,7 @@ ] }, "rep stosq": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 58, "Comment": [ "Unrolling the loop for faster memset can be done.", "Taking advantage of ARM MOPs instructions can be done", @@ -3420,14 +3746,56 @@ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w20, #1, #+0x1c", + "tbnz w20, #1, #+0x6c", + "cbz x0, #+0x60", + "and x3, x1, #0x3", + "cbnz x3, #+0x4c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x2c", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #8", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #3", - "b #+0x18", - "cbz x0, #+0x10", + "b #+0x70", + "cbz x0, #+0x68", + "and x3, x1, #0x3", + "cbnz x3, #+0x54", + "sub x1, x1, #0x18 (24)", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x14", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x8 (8)", + "tbz x0, #63, #-0xc", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "sub x0, x0, #0x4 (4)", + "tbz x0, #63, #-0x8", + "add x0, x0, #0x4 (4)", + "cbz x0, #+0x14", + "add x1, x1, #0x18 (24)", "str x4, [x1], #-8", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8",