diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/EncryptionOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/EncryptionOps.cpp index 0276c3306b..ab6f59c698 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/EncryptionOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/EncryptionOps.cpp @@ -201,7 +201,7 @@ DEF_OP(VSha256U0) { else { mov(VTMP1.Q(), Src1.Q()); sha256su0(VTMP1, Src2); - mov(Dst.Q(), Src1.Q()); + mov(Dst.Q(), VTMP1.Q()); } } diff --git a/FEXCore/Source/Interface/IR/Passes/DeadContextStoreElimination.cpp b/FEXCore/Source/Interface/IR/Passes/DeadContextStoreElimination.cpp index 8219e11783..ca95b8333f 100644 --- a/FEXCore/Source/Interface/IR/Passes/DeadContextStoreElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/DeadContextStoreElimination.cpp @@ -547,10 +547,20 @@ bool RCLSE::ClassifyContextLoad(FEXCore::IR::IREmitter *IREmit, ContextInfo *Loc bool RCLSE::ClassifyContextStore(FEXCore::IR::IREmitter *IREmit, ContextInfo *LocalInfo, FEXCore::IR::RegisterClassType Class, uint32_t Offset, uint8_t Size, FEXCore::IR::OrderedNode *CodeNode, FEXCore::IR::OrderedNode *ValueNode) { auto Info = FindMemberInfo(LocalInfo, Offset, Size); + ContextMemberInfo PreviousMemberInfoCopy = *Info; RecordAccess(Info, Class, Offset, Size, LastAccessType::WRITE, ValueNode, CodeNode); - // TODO: Optimize redundant stores. - // ContextMemberInfo PreviousMemberInfoCopy = *Info; + + if (PreviousMemberInfoCopy.AccessRegClass == Info->AccessRegClass && + PreviousMemberInfoCopy.AccessOffset == Info->AccessOffset && + PreviousMemberInfoCopy.AccessSize == Size && + PreviousMemberInfoCopy.Accessed == LastAccessType::WRITE) { + // This optimizes redundant stores with no intervening load + IREmit->Remove(PreviousMemberInfoCopy.StoreNode); + return true; + } + + // TODO: Optimize the case of partial stores. return false; } diff --git a/unittests/ASM/Disabled_Tests_Simulator b/unittests/ASM/Disabled_Tests_Simulator index e802ebc5c6..7320481074 100644 --- a/unittests/ASM/Disabled_Tests_Simulator +++ b/unittests/ASM/Disabled_Tests_Simulator @@ -83,3 +83,6 @@ Test_VEX/vroundpd.asm Test_VEX/vroundps.asm Test_VEX/vroundsd.asm Test_VEX/vroundss.asm + +# Simulator doesn't support cycle counter reading +Test_TwoByte/0F_31.asm diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst.json b/unittests/InstructionCountCI/FEXOpt/MultiInst.json index b600f7110d..a3b790c588 100644 --- a/unittests/InstructionCountCI/FEXOpt/MultiInst.json +++ b/unittests/InstructionCountCI/FEXOpt/MultiInst.json @@ -14,7 +14,7 @@ ], "Instructions": { "push ax, bx": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 4, "Comment": [ "Mergable 16-bit pushes. May or may not be an optimization." ], @@ -23,12 +23,14 @@ "push bx" ], "ExpectedArm64ASM": [ - "strh w4, [x8, #-2]!", + "mov x20, x8", + "strh w4, [x20, #-2]!", + "mov x8, x20", "strh w7, [x8, #-2]!" ] }, "push rax, rbx": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 4, "Comment": [ "Mergable 64-bit pushes" ], @@ -37,12 +39,14 @@ "push rbx" ], "ExpectedArm64ASM": [ - "str x4, [x8, #-8]!", + "mov x20, x8", + "str x4, [x20, #-8]!", + "mov x8, x20", "str x7, [x8, #-8]!" ] }, "adds xmm0, xmm1, xmm2": { - "ExpectedInstructionCount": 4, + "ExpectedInstructionCount": 6, "Comment": [ "Redundant scalar adds that can get eliminated without AFP." ], @@ -51,9 +55,11 @@ "addss xmm0, xmm2" ], "ExpectedArm64ASM": [ + "mov v2.16b, v16.16b", "fadd s0, s16, s17", - "mov v16.s[0], v0.s[0]", - "fadd s0, s16, s18", + "mov v2.s[0], v0.s[0]", + "mov v16.16b, v2.16b", + "fadd s0, s2, s18", "mov v16.s[0], v0.s[0]" ] }, diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst_AFP.json b/unittests/InstructionCountCI/FEXOpt/MultiInst_AFP.json index 5644759ae7..308651ff4a 100644 --- a/unittests/InstructionCountCI/FEXOpt/MultiInst_AFP.json +++ b/unittests/InstructionCountCI/FEXOpt/MultiInst_AFP.json @@ -15,7 +15,7 @@ ], "Instructions": { "adds xmm0, xmm1, xmm2": { - "ExpectedInstructionCount": 2, + "ExpectedInstructionCount": 4, "Comment": [ "Redundant scalar operations should get eliminated with AFP" ], @@ -24,8 +24,10 @@ "addss xmm0, xmm2" ], "ExpectedArm64ASM": [ - "fadd s16, s16, s17", - "fadd s16, s16, s18" + "mov v2.16b, v16.16b", + "fadd s2, s16, s17", + "mov v16.16b, v2.16b", + "fadd s16, s2, s18" ] } } diff --git a/unittests/InstructionCountCI/FEXOpt/libnss.json b/unittests/InstructionCountCI/FEXOpt/libnss.json index d3fc751f99..6ca42a9feb 100644 --- a/unittests/InstructionCountCI/FEXOpt/libnss.json +++ b/unittests/InstructionCountCI/FEXOpt/libnss.json @@ -16,7 +16,7 @@ "Comment": [], "Instructions": { "libnss3 sha": { - "ExpectedInstructionCount": 2391, + "ExpectedInstructionCount": 2291, "Comment": [ "This block of code comes from libnss3 which causes panic spilling in FEX's RA.", "This code is hit in steamwebhelper calling in to this function.", @@ -193,40 +193,38 @@ "movups [rdi+0x110], xmm4" ], "ExpectedArm64ASM": [ - "ldr q18, [x11, #256]", - "ldr q19, [x11, #272]", - "ldr q24, [x11]", - "ldr q23, [x11, #16]", + "ldr q2, [x11, #256]", + "ldr q3, [x11, #272]", + "ldr q4, [x11]", + "ldr q5, [x11, #16]", "ldr x0, [x28, #1688]", - "ldr q2, [x0, #2832]", - "tbl v16.16b, {v18.16b}, v2.16b", + "ldr q6, [x0, #2832]", + "tbl v2.16b, {v2.16b}, v6.16b", "ldr x0, [x28, #1688]", - "ldr q3, [x0, #432]", - "tbl v18.16b, {v19.16b}, v3.16b", - "ldr q22, [x11, #32]", - "ldr q21, [x11, #48]", - "mov v19.16b, v16.16b", - "ext v19.16b, v18.16b, v16.16b, #8", - "mov v18.d[1], v16.d[1]", + "ldr q7, [x0, #432]", + "tbl v3.16b, {v3.16b}, v7.16b", + "ldr q8, [x11, #32]", + "ldr q9, [x11, #48]", + "ext v19.16b, v3.16b, v2.16b, #8", + "mov v18.16b, v3.16b", + "mov v18.d[1], v2.d[1]", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov v20.16b, v18.16b", - "mov v17.16b, v19.16b", - "movi v4.16b, #0x8f", - "and v4.16b, v16.16b, v4.16b", - "tbl v21.16b, {v21.16b}, v4.16b", - "movi v4.16b, #0x8f", - "and v4.16b, v16.16b, v4.16b", - "tbl v22.16b, {v22.16b}, v4.16b", - "movi v4.16b, #0x8f", - "and v4.16b, v16.16b, v4.16b", - "tbl v23.16b, {v23.16b}, v4.16b", - "movi v4.16b, #0x8f", - "and v4.16b, v16.16b, v4.16b", - "tbl v24.16b, {v24.16b}, v4.16b", - "ldr q16, [x29, x20, sxtx]", - "add v16.4s, v16.4s, v24.4s", + "ldr q2, [x29, x20, sxtx]", + "movi v3.16b, #0x8f", + "and v3.16b, v2.16b, v3.16b", + "tbl v3.16b, {v9.16b}, v3.16b", + "movi v9.16b, #0x8f", + "and v9.16b, v2.16b, v9.16b", + "tbl v8.16b, {v8.16b}, v9.16b", + "movi v9.16b, #0x8f", + "and v9.16b, v2.16b, v9.16b", + "tbl v5.16b, {v5.16b}, v9.16b", + "movi v9.16b, #0x8f", + "and v2.16b, v2.16b, v9.16b", + "tbl v2.16b, {v4.16b}, v2.16b", + "ldr q4, [x29, x20, sxtx]", + "add v4.4s, v4.4s, v2.4s", "unimplemented (Unimplemented)", "mov w20, v19.s[1]", "mov w21, v19.s[0]", @@ -238,7 +236,7 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", "mov w23, v18.s[0]", "add w22, w22, w23", @@ -263,7 +261,7 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "mov w21, v18.s[1]", "add w20, w20, w21", @@ -278,17 +276,16 @@ "add w21, w21, w23", "mov w23, v18.s[3]", "add w20, w20, w23", - "mov v4.16b, v18.16b", - "mov v4.s[3], w21", - "mov v4.s[2], w25", - "mov v4.s[1], w20", - "mov v20.16b, v4.16b", - "mov v20.s[0], w22", + "mov v9.16b, v18.16b", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", "ldr x0, [x28, #1688]", - "ldr q4, [x0, #224]", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", + "ldr q10, [x0, #224]", + "tbl v4.16b, {v4.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", "mov w22, v19.s[1]", "and w23, w20, w21", "bic w22, w22, w20", @@ -297,12 +294,12 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", "mov w23, v19.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", "mov w25, v19.s[3]", "and w30, w24, w25", "orr w25, w24, w25", @@ -322,7 +319,7 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "mov w21, v19.s[1]", "add w20, w20, w21", @@ -337,59 +334,55 @@ "add w21, w21, w23", "mov w23, v19.s[3]", "add w20, w20, w23", - "mov v5.16b, v19.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v21.16b", - "ext v16.16b, v22.16b, v21.16b, #4", - "add v24.4s, v24.4s, v16.4s", + "mov v4.16b, v19.16b", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v8.16b, v3.16b, #4", + "add v2.4s, v2.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v21.s[2]", - "mov w21, v21.s[3]", - "mov w22, v24.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v3.s[2]", + "mov w21, v3.s[3]", + "mov w22, v2.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v24.s[1]", + "mov w22, v2.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v24.s[2]", + "mov w22, v2.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v24.s[3]", + "mov w23, v2.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v24.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v24.16b, v5.16b", - "mov v24.s[0], w20", - "add v16.4s, v16.4s, v23.4s", + "mov v2.s[3], w23", + "mov v2.s[2], w22", + "mov v2.s[1], w21", + "mov v2.s[0], w20", + "add v11.4s, v11.4s, v5.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -397,13 +390,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -413,7 +406,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -422,9 +415,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -435,18 +428,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -454,13 +445,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -470,7 +461,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -479,9 +470,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -492,61 +483,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v24.16b", - "ext v16.16b, v21.16b, v24.16b, #4", - "add v23.4s, v23.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v3.16b, v2.16b, #4", + "add v5.4s, v5.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v24.s[2]", - "mov w21, v24.s[3]", - "mov w22, v23.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v2.s[2]", + "mov w21, v2.s[3]", + "mov w22, v5.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v23.s[1]", + "mov w22, v5.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v23.s[2]", + "mov w22, v5.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v23.s[3]", + "mov w23, v5.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v23.16b", "mov v5.s[3], w23", "mov v5.s[2], w22", "mov v5.s[1], w21", - "mov v23.16b, v5.16b", - "mov v23.s[0], w20", - "add v16.4s, v16.4s, v22.4s", + "mov v5.s[0], w20", + "add v11.4s, v11.4s, v8.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -554,13 +540,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -570,7 +556,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -579,9 +565,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -592,18 +578,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -611,13 +595,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -627,7 +611,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -636,9 +620,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -649,61 +633,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v23.16b", - "ext v16.16b, v24.16b, v23.16b, #4", - "add v22.4s, v22.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v2.16b, v5.16b, #4", + "add v8.4s, v8.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v23.s[2]", - "mov w21, v23.s[3]", - "mov w22, v22.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v5.s[2]", + "mov w21, v5.s[3]", + "mov w22, v8.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v22.s[1]", + "mov w22, v8.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v22.s[2]", + "mov w22, v8.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v22.s[3]", + "mov w23, v8.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v22.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v22.16b, v5.16b", - "mov v22.s[0], w20", - "add v16.4s, v16.4s, v21.4s", + "mov v8.s[3], w23", + "mov v8.s[2], w22", + "mov v8.s[1], w21", + "mov v8.s[0], w20", + "add v11.4s, v11.4s, v3.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -711,13 +690,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -727,7 +706,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -736,9 +715,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -749,18 +728,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -768,13 +745,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -784,7 +761,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -793,9 +770,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -806,61 +783,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v22.16b", - "ext v16.16b, v23.16b, v22.16b, #4", - "add v21.4s, v21.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v5.16b, v8.16b, #4", + "add v3.4s, v3.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v22.s[2]", - "mov w21, v22.s[3]", - "mov w22, v21.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v8.s[2]", + "mov w21, v8.s[3]", + "mov w22, v3.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v21.s[1]", + "mov w22, v3.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v21.s[2]", + "mov w22, v3.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v21.s[3]", + "mov w23, v3.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v21.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v21.16b, v5.16b", - "mov v21.s[0], w20", - "add v16.4s, v16.4s, v24.4s", + "mov v3.s[3], w23", + "mov v3.s[2], w22", + "mov v3.s[1], w21", + "mov v3.s[0], w20", + "add v11.4s, v11.4s, v2.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -868,13 +840,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -884,7 +856,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -893,9 +865,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -906,18 +878,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -925,13 +895,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -941,7 +911,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -950,9 +920,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -963,61 +933,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v21.16b", - "ext v16.16b, v22.16b, v21.16b, #4", - "add v24.4s, v24.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v8.16b, v3.16b, #4", + "add v2.4s, v2.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v21.s[2]", - "mov w21, v21.s[3]", - "mov w22, v24.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v3.s[2]", + "mov w21, v3.s[3]", + "mov w22, v2.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v24.s[1]", + "mov w22, v2.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v24.s[2]", + "mov w22, v2.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v24.s[3]", + "mov w23, v2.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v24.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v24.16b, v5.16b", - "mov v24.s[0], w20", - "add v16.4s, v16.4s, v23.4s", + "mov v2.s[3], w23", + "mov v2.s[2], w22", + "mov v2.s[1], w21", + "mov v2.s[0], w20", + "add v11.4s, v11.4s, v5.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1025,13 +990,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1041,7 +1006,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1050,9 +1015,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1063,18 +1028,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1082,13 +1045,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1098,7 +1061,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1107,9 +1070,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1120,61 +1083,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v24.16b", - "ext v16.16b, v21.16b, v24.16b, #4", - "add v23.4s, v23.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v3.16b, v2.16b, #4", + "add v5.4s, v5.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v24.s[2]", - "mov w21, v24.s[3]", - "mov w22, v23.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v2.s[2]", + "mov w21, v2.s[3]", + "mov w22, v5.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v23.s[1]", + "mov w22, v5.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v23.s[2]", + "mov w22, v5.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v23.s[3]", + "mov w23, v5.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v23.16b", "mov v5.s[3], w23", "mov v5.s[2], w22", "mov v5.s[1], w21", - "mov v23.16b, v5.16b", - "mov v23.s[0], w20", - "add v16.4s, v16.4s, v22.4s", + "mov v5.s[0], w20", + "add v11.4s, v11.4s, v8.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1182,13 +1140,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1198,7 +1156,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1207,9 +1165,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1220,18 +1178,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1239,13 +1195,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1255,7 +1211,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1264,9 +1220,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1277,61 +1233,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v23.16b", - "ext v16.16b, v24.16b, v23.16b, #4", - "add v22.4s, v22.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v2.16b, v5.16b, #4", + "add v8.4s, v8.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v23.s[2]", - "mov w21, v23.s[3]", - "mov w22, v22.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v5.s[2]", + "mov w21, v5.s[3]", + "mov w22, v8.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v22.s[1]", + "mov w22, v8.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v22.s[2]", + "mov w22, v8.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v22.s[3]", + "mov w23, v8.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v22.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v22.16b, v5.16b", - "mov v22.s[0], w20", - "add v16.4s, v16.4s, v21.4s", + "mov v8.s[3], w23", + "mov v8.s[2], w22", + "mov v8.s[1], w21", + "mov v8.s[0], w20", + "add v11.4s, v11.4s, v3.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1339,13 +1290,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1355,7 +1306,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1364,9 +1315,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1377,18 +1328,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1396,13 +1345,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1412,7 +1361,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1421,9 +1370,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1434,61 +1383,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v22.16b", - "ext v16.16b, v23.16b, v22.16b, #4", - "add v21.4s, v21.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v5.16b, v8.16b, #4", + "add v3.4s, v3.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v22.s[2]", - "mov w21, v22.s[3]", - "mov w22, v21.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v8.s[2]", + "mov w21, v8.s[3]", + "mov w22, v3.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v21.s[1]", + "mov w22, v3.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v21.s[2]", + "mov w22, v3.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v21.s[3]", + "mov w23, v3.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v21.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v21.16b, v5.16b", - "mov v21.s[0], w20", - "add v16.4s, v16.4s, v24.4s", + "mov v3.s[3], w23", + "mov v3.s[2], w22", + "mov v3.s[1], w21", + "mov v3.s[0], w20", + "add v11.4s, v11.4s, v2.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1496,13 +1440,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1512,7 +1456,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1521,9 +1465,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1534,18 +1478,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v11.16b, {v11.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1553,13 +1495,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v11.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1569,7 +1511,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1578,9 +1520,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v11.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1591,61 +1533,57 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v21.16b", - "ext v16.16b, v22.16b, v21.16b, #4", - "add v24.4s, v24.4s, v16.4s", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "ext v11.16b, v8.16b, v3.16b, #4", + "add v2.4s, v2.4s, v11.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v21.s[2]", - "mov w21, v21.s[3]", - "mov w22, v24.s[0]", + "ldr q11, [x29, x20, sxtx]", + "mov w20, v3.s[2]", + "mov w21, v3.s[3]", + "mov w22, v2.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v24.s[1]", + "mov w22, v2.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v24.s[2]", + "mov w22, v2.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v24.s[3]", + "mov w23, v2.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v24.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v24.16b, v5.16b", + "mov v2.s[3], w23", + "mov v2.s[2], w22", + "mov v2.s[1], w21", + "mov v24.16b, v2.16b", "mov v24.s[0], w20", - "add v16.4s, v16.4s, v23.4s", + "add v2.4s, v11.4s, v5.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1653,13 +1591,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1669,7 +1607,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1678,9 +1616,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1691,18 +1629,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v2.16b, {v2.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1710,13 +1646,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1726,7 +1662,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1735,9 +1671,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1748,61 +1684,57 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v24.16b", - "ext v16.16b, v21.16b, v24.16b, #4", - "add v23.4s, v23.4s, v16.4s", + "mov v2.16b, v4.16b", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "ext v4.16b, v3.16b, v24.16b, #4", + "add v4.4s, v5.4s, v4.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", + "ldr q5, [x29, x20, sxtx]", "mov w20, v24.s[2]", "mov w21, v24.s[3]", - "mov w22, v23.s[0]", + "mov w22, v4.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v23.s[1]", + "mov w22, v4.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v23.s[2]", + "mov w22, v4.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v23.s[3]", + "mov w23, v4.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v23.16b", - "mov v5.s[3], w23", - "mov v5.s[2], w22", - "mov v5.s[1], w21", - "mov v23.16b, v5.16b", - "mov v23.s[0], w20", - "add v16.4s, v16.4s, v22.4s", + "mov v4.s[3], w23", + "mov v4.s[2], w22", + "mov v4.s[1], w21", + "mov v4.s[0], w20", + "add v5.4s, v5.4s, v8.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1810,13 +1742,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v5.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1826,7 +1758,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1835,9 +1767,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v5.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1848,18 +1780,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v5.16b, {v5.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1867,13 +1797,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v5.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1883,7 +1813,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1892,9 +1822,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v5.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -1905,61 +1835,56 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v23.16b", - "ext v16.16b, v24.16b, v23.16b, #4", - "add v22.4s, v22.4s, v16.4s", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "ext v5.16b, v24.16b, v4.16b, #4", + "add v5.4s, v8.4s, v5.4s", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q16, [x29, x20, sxtx]", - "mov w20, v23.s[2]", - "mov w21, v23.s[3]", - "mov w22, v22.s[0]", + "ldr q8, [x29, x20, sxtx]", + "mov w20, v4.s[2]", + "mov w21, v4.s[3]", + "mov w22, v5.s[0]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w20, w20, #10", "eor w20, w23, w20", "add w20, w22, w20", - "mov w22, v22.s[1]", + "mov w22, v5.s[1]", "ror w23, w21, #17", "ror w24, w21, #19", "eor w23, w23, w24", "lsr w21, w21, #10", "eor w21, w23, w21", "add w21, w22, w21", - "mov w22, v22.s[2]", + "mov w22, v5.s[2]", "ror w23, w20, #17", "ror w24, w20, #19", "eor w23, w23, w24", "lsr w24, w20, #10", "eor w23, w23, w24", "add w22, w22, w23", - "mov w23, v22.s[3]", + "mov w23, v5.s[3]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov v5.16b, v22.16b", "mov v5.s[3], w23", "mov v5.s[2], w22", "mov v5.s[1], w21", - "mov v22.16b, v5.16b", - "mov v22.s[0], w20", - "add v16.4s, v16.4s, v21.4s", + "mov v5.s[0], w20", + "add v8.4s, v8.4s, v3.4s", "unimplemented (Unimplemented)", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -1967,13 +1892,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v8.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -1983,7 +1908,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -1992,9 +1917,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v8.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2005,18 +1930,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v9.s[3], w21", + "mov v9.s[2], w25", + "mov v9.s[1], w20", + "mov v9.s[0], w22", + "tbl v8.16b, {v8.16b}, v10.16b", + "mov w20, v9.s[1]", + "mov w21, v9.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2024,13 +1947,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v8.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v9.s[3]", + "mov w24, v9.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2040,7 +1963,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2049,9 +1972,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v8.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2062,66 +1985,61 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v22.16b", - "ext v16.16b, v23.16b, v22.16b, #4", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "ext v8.16b, v4.16b, v5.16b, #4", "mov w20, #0x1000", "movk w20, #0x1, lsl #16", - "ldr q5, [x29, x20, sxtx]", - "add v23.4s, v23.4s, v5.4s", - "add v21.4s, v21.4s, v16.4s", - "ldr q16, [x29, x20, sxtx]", - "mov w21, v22.s[2]", - "mov w22, v22.s[3]", - "mov w23, v21.s[0]", + "ldr q11, [x29, x20, sxtx]", + "add v23.4s, v4.4s, v11.4s", + "add v3.4s, v3.4s, v8.4s", + "ldr q4, [x29, x20, sxtx]", + "mov w21, v5.s[2]", + "mov w22, v5.s[3]", + "mov w23, v3.s[0]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w21, w21, #10", "eor w21, w24, w21", "add w21, w23, w21", - "mov w23, v21.s[1]", + "mov w23, v3.s[1]", "ror w24, w22, #17", "ror w25, w22, #19", "eor w24, w24, w25", "lsr w22, w22, #10", "eor w22, w24, w22", "add w22, w23, w22", - "mov w23, v21.s[2]", + "mov w23, v3.s[2]", "ror w24, w21, #17", "ror w25, w21, #19", "eor w24, w24, w25", "lsr w25, w21, #10", "eor w24, w24, w25", "add w23, w23, w24", - "mov w24, v21.s[3]", + "mov w24, v3.s[3]", "ror w25, w22, #17", "ror w30, w22, #19", "eor w25, w25, w30", "lsr w30, w22, #10", "eor w25, w25, w30", "add w24, w24, w25", - "mov v5.16b, v21.16b", - "mov v5.s[3], w24", - "mov v5.s[2], w23", - "mov v5.s[1], w22", - "mov v21.16b, v5.16b", - "mov v21.s[0], w21", + "mov v3.s[3], w24", + "mov v3.s[2], w23", + "mov v3.s[1], w22", + "mov v3.s[0], w21", + "ldr q8, [x29, x20, sxtx]", + "add v22.4s, v5.4s, v8.4s", "ldr q5, [x29, x20, sxtx]", - "add v22.4s, v22.4s, v5.4s", - "ldr q5, [x29, x20, sxtx]", - "add v21.4s, v21.4s, v5.4s", - "add v16.4s, v16.4s, v24.4s", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "add v21.4s, v3.4s, v5.4s", + "add v3.4s, v4.4s, v24.4s", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v9.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2129,13 +2047,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v3.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v9.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v9.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2145,7 +2063,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v9.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2154,9 +2072,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v3.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v9.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2167,18 +2085,17 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v9.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v16.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v4.16b, v9.16b", + "mov v4.s[3], w21", + "mov v4.s[2], w25", + "mov v4.s[1], w20", + "mov v4.s[0], w22", + "tbl v3.16b, {v3.16b}, v10.16b", + "mov w20, v4.s[1]", + "mov w21, v4.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2186,13 +2103,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v3.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v4.s[3]", + "mov w24, v4.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2202,7 +2119,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2211,9 +2128,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v3.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2224,18 +2141,15 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v23.16b", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v4.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2245,11 +2159,11 @@ "add w22, w22, w23", "mov w23, v23.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v4.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2259,7 +2173,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v4.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2270,7 +2184,7 @@ "add w20, w20, w21", "mov w21, v23.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2281,18 +2195,17 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v4.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v23.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v3.16b, v4.16b", + "mov v3.s[3], w21", + "mov v3.s[2], w25", + "mov v3.s[1], w20", + "mov v3.s[0], w22", + "tbl v4.16b, {v23.16b}, v10.16b", + "mov w20, v3.s[1]", + "mov w21, v3.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2300,13 +2213,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v3.s[3]", + "mov w24, v3.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2316,7 +2229,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2325,9 +2238,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2338,18 +2251,15 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v22.16b", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v3.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2359,11 +2269,11 @@ "add w22, w22, w23", "mov w23, v22.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v3.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v3.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2373,7 +2283,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v3.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2384,7 +2294,7 @@ "add w20, w20, w21", "mov w21, v22.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v3.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2395,18 +2305,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v3.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v22.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v3.s[3], w21", + "mov v3.s[2], w25", + "mov v3.s[1], w20", + "mov v3.s[0], w22", + "tbl v4.16b, {v22.16b}, v10.16b", + "mov w20, v3.s[1]", + "mov w21, v3.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2414,13 +2322,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v3.s[3]", + "mov w24, v3.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2430,7 +2338,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2439,9 +2347,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2452,18 +2360,15 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v5.16b, v17.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v17.16b, v5.16b", - "mov v17.s[0], w22", - "mov v16.16b, v21.16b", - "mov w20, v17.s[1]", - "mov w21, v17.s[0]", - "mov w22, v20.s[1]", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "mov w20, v2.s[1]", + "mov w21, v2.s[0]", + "mov w22, v3.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2473,11 +2378,11 @@ "add w22, w22, w23", "mov w23, v21.s[0]", "add w22, w22, w23", - "mov w23, v20.s[0]", + "mov w23, v3.s[0]", "add w22, w22, w23", - "mov w23, v17.s[3]", - "mov w24, v17.s[2]", - "mov w25, v20.s[3]", + "mov w23, v2.s[3]", + "mov w24, v2.s[2]", + "mov w25, v3.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2487,7 +2392,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v20.s[2]", + "mov w30, v3.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2498,7 +2403,7 @@ "add w20, w20, w21", "mov w21, v21.s[1]", "add w20, w20, w21", - "mov w21, v20.s[1]", + "mov w21, v3.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2509,18 +2414,16 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v20.s[3]", + "mov w23, v3.s[3]", "add w20, w20, w23", - "mov v5.16b, v20.16b", - "mov v5.s[3], w21", - "mov v5.s[2], w25", - "mov v5.s[1], w20", - "mov v20.16b, v5.16b", - "mov v20.s[0], w22", - "tbl v16.16b, {v21.16b}, v4.16b", - "mov w20, v20.s[1]", - "mov w21, v20.s[0]", - "mov w22, v17.s[1]", + "mov v3.s[3], w21", + "mov v3.s[2], w25", + "mov v3.s[1], w20", + "mov v3.s[0], w22", + "tbl v4.16b, {v21.16b}, v10.16b", + "mov w20, v3.s[1]", + "mov w21, v3.s[0]", + "mov w22, v2.s[1]", "and w23, w20, w21", "bic w22, w22, w20", "eor w22, w23, w22", @@ -2528,13 +2431,13 @@ "eor w23, w23, w20, ror #11", "eor w23, w23, w20, ror #25", "add w22, w22, w23", - "mov w23, v16.s[0]", + "mov w23, v4.s[0]", "add w22, w22, w23", - "mov w23, v17.s[0]", + "mov w23, v2.s[0]", "add w22, w22, w23", - "mov w23, v20.s[3]", - "mov w24, v20.s[2]", - "mov w25, v17.s[3]", + "mov w23, v3.s[3]", + "mov w24, v3.s[2]", + "mov w25, v2.s[3]", "and w30, w24, w25", "orr w25, w24, w25", "and w25, w23, w25", @@ -2544,7 +2447,7 @@ "eor w30, w30, w23, ror #13", "eor w30, w30, w23, ror #22", "add w25, w25, w30", - "mov w30, v17.s[2]", + "mov w30, v2.s[2]", "add w22, w22, w30", "and w20, w22, w20", "bic w21, w21, w22", @@ -2553,9 +2456,9 @@ "eor w21, w21, w22, ror #11", "eor w21, w21, w22, ror #25", "add w20, w20, w21", - "mov w21, v16.s[1]", + "mov w21, v4.s[1]", "add w20, w20, w21", - "mov w21, v17.s[1]", + "mov w21, v2.s[1]", "add w20, w20, w21", "and w21, w23, w24", "orr w23, w23, w24", @@ -2566,22 +2469,19 @@ "eor w23, w23, w25, ror #13", "eor w23, w23, w25, ror #22", "add w21, w21, w23", - "mov w23, v17.s[3]", + "mov w23, v2.s[3]", "add w20, w20, w23", - "mov v4.16b, v17.16b", - "mov v4.s[3], w21", - "mov v4.s[2], w25", - "mov v4.s[1], w20", - "mov v17.16b, v4.16b", - "mov v17.s[0], w22", - "add v20.4s, v20.4s, v18.4s", - "add v17.4s, v17.4s, v19.4s", - "tbl v20.16b, {v20.16b}, v2.16b", - "tbl v17.16b, {v17.16b}, v3.16b", - "mov v16.16b, v17.16b", + "mov v2.s[3], w21", + "mov v2.s[2], w25", + "mov v2.s[1], w20", + "mov v2.s[0], w22", + "add v3.4s, v3.4s, v18.4s", + "add v2.4s, v2.4s, v19.4s", + "tbl v3.16b, {v3.16b}, v6.16b", + "tbl v17.16b, {v2.16b}, v7.16b", "mov v16.16b, v17.16b", - "mov v16.d[1], v20.d[1]", - "ext v20.16b, v17.16b, v20.16b, #8", + "mov v16.d[1], v3.d[1]", + "ext v20.16b, v17.16b, v3.16b, #8", "str q16, [x11, #256]", "str q20, [x11, #272]" ] diff --git a/unittests/InstructionCountCI/FlagM/FlagOpts.json b/unittests/InstructionCountCI/FlagM/FlagOpts.json index fed3fa264f..a1e03dab25 100644 --- a/unittests/InstructionCountCI/FlagM/FlagOpts.json +++ b/unittests/InstructionCountCI/FlagM/FlagOpts.json @@ -86,16 +86,15 @@ ] }, "INC consumed": { - "ExpectedInstructionCount": 6, + "ExpectedInstructionCount": 5, "x86Insts": [ "add rax, rbx", "inc rax" ], "ExpectedArm64ASM": [ - "adds x4, x4, x7", + "adds x27, x4, x7", "cset w20, hs", - "mov x27, x4", - "adds x26, x4, #0x1 (1)", + "adds x26, x27, #0x1 (1)", "rmif x20, #63, #nzCv", "mov x4, x26" ] @@ -108,23 +107,22 @@ "test rax, rdx" ], "ExpectedArm64ASM": [ - "add x4, x4, x7", - "add x4, x4, #0x1 (1)", + "add x20, x4, x7", + "add x4, x20, #0x1 (1)", "ands x26, x4, x6" ] }, "DEC consumed": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 6, "x86Insts": [ "sub rax, rbx", "dec rax" ], "ExpectedArm64ASM": [ - "subs x4, x4, x7", + "subs x27, x4, x7", "cfinv", "cset w20, hs", - "mov x27, x4", - "subs x26, x4, #0x1 (1)", + "subs x26, x27, #0x1 (1)", "rmif x20, #63, #nzCv", "mov x4, x26" ] @@ -137,13 +135,13 @@ "test rax, rcx" ], "ExpectedArm64ASM": [ - "sub x4, x4, x7", - "sub x4, x4, #0x1 (1)", + "sub x20, x4, x7", + "sub x4, x20, #0x1 (1)", "ands x26, x4, x5" ] }, "8-bit DEC consumed": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 15, "x86Insts": [ "sub al, ah", "dec al" @@ -154,17 +152,20 @@ "cmp w0, w20, lsl #24", "sub w20, w4, w20", "cfinv", - "bfxil x4, x20, #0, #8", - "uxtb w27, w4", + "mov x0, x4", + "bfxil x0, x20, #0, #8", + "mov x20, x0", + "uxtb w27, w20", "sub w26, w27, #0x1 (1)", "setf8 w26", - "bic w20, w27, w26", - "rmif x20, #7, #nzcV", + "bic w21, w27, w26", + "rmif x21, #7, #nzcV", + "mov x4, x20", "bfxil x4, x26, #0, #8" ] }, "8-bit DEC dead": { - "ExpectedInstructionCount": 8, + "ExpectedInstructionCount": 11, "x86Insts": [ "sub al, ah", "dec al", @@ -173,10 +174,13 @@ "ExpectedArm64ASM": [ "lsr w20, w4, #8", "sub w20, w4, w20", - "bfxil x4, x20, #0, #8", - "uxtb w20, w4", - "sub w20, w20, #0x1 (1)", - "bfxil x4, x20, #0, #8", + "mov x0, x4", + "bfxil x0, x20, #0, #8", + "mov x20, x0", + "uxtb w21, w20", + "sub w21, w21, #0x1 (1)", + "mov x4, x20", + "bfxil x4, x21, #0, #8", "mov x26, x4", "cmn wzr, w26, lsl #24" ] diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks.json b/unittests/InstructionCountCI/FlagM/HotBlocks.json index 9838b0a9d9..eba79fdea0 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks.json @@ -13,7 +13,7 @@ }, "Instructions": { "The Witcher 3": { - "ExpectedInstructionCount": 9, + "ExpectedInstructionCount": 8, "x86Insts": [ "mov eax, 0x1", "lock xadd qword [rcx], rax", @@ -24,19 +24,18 @@ "add rdx, rcx" ], "ExpectedArm64ASM": [ - "mov w4, #0x1", - "ldaddal x4, x4, [x5]", - "mov x6, x4", - "and w6, w4, #0x1f", - "add x6, x6, #0x1 (1)", - "lsl x6, x6, #6", - "eor w27, w6, w5", - "adds x26, x6, x5", + "mov w20, #0x1", + "ldaddal x20, x4, [x5]", + "and w20, w4, #0x1f", + "add x20, x20, #0x1 (1)", + "lsl x20, x20, #6", + "eor w27, w20, w5", + "adds x26, x20, x5", "mov x6, x26" ] }, "FMOD scalar loop": { - "ExpectedInstructionCount": 88, + "ExpectedInstructionCount": 86, "x86Insts": [ "mov esi, ecx", "mov rdx, rbp", @@ -78,92 +77,90 @@ "sub esi, 0x1" ], "ExpectedArm64ASM": [ - "mov w10, w5", - "mov x6, x9", - "mov x4, x7", - "ldr s18, [x6]", - "add x4, x4, #0x20 (32)", - "fmul s0, s18, s16", - "mov v18.s[0], v0.s[0]", - "add x6, x6, #0x20 (32)", + "mov w27, w5", + "ldr s2, [x9]", + "add x4, x7, #0x20 (32)", + "fmul s0, s2, s16", + "mov v2.s[0], v0.s[0]", + "add x6, x9, #0x20 (32)", "sub x20, x4, #0x20 (32)", - "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x20 (32)", - "str s18, [x20]", + "str s2, [x20]", "sub x20, x6, #0x1c (28)", - "ldr s18, [x20]", - "fmul s0, s18, s17", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x1c (28)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s17", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x1c (28)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x1c (28)", + "str s2, [x20]", "sub x20, x6, #0x18 (24)", - "ldr s18, [x20]", - "fmul s0, s18, s16", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x18 (24)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s16", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x18 (24)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x18 (24)", + "str s2, [x20]", "sub x20, x6, #0x14 (20)", - "ldr s18, [x20]", - "fmul s0, s18, s17", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x14 (20)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s17", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x14 (20)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x14 (20)", + "str s2, [x20]", "sub x20, x6, #0x10 (16)", - "ldr s18, [x20]", - "fmul s0, s18, s16", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x10 (16)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s16", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x10 (16)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x10 (16)", + "str s2, [x20]", "sub x20, x6, #0xc (12)", - "ldr s18, [x20]", - "fmul s0, s18, s17", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0xc (12)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s17", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0xc (12)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0xc (12)", + "str s2, [x20]", "sub x20, x6, #0x8 (8)", - "ldr s18, [x20]", - "fmul s0, s18, s16", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x8 (8)", "ldr s2, [x20]", - "fadd s0, s18, s2", - "mov v18.s[0], v0.s[0]", + "fmul s0, s2, s16", + "mov v2.s[0], v0.s[0]", "sub x20, x4, #0x8 (8)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s0, s2, s3", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x8 (8)", + "str s2, [x20]", "sub x20, x6, #0x4 (4)", - "ldr s18, [x20]", - "fmul s0, s18, s17", - "mov v18.s[0], v0.s[0]", - "sub x20, x4, #0x4 (4)", "ldr s2, [x20]", - "fadd s0, s18, s2", + "fmul s0, s2, s17", + "mov v2.s[0], v0.s[0]", + "sub x20, x4, #0x4 (4)", + "ldr s3, [x20]", + "mov v18.16b, v2.16b", + "fadd s0, s2, s3", "mov v18.s[0], v0.s[0]", "sub x20, x4, #0x4 (4)", "str s18, [x20]", - "mov x27, x10", - "subs w26, w10, #0x1 (1)", + "subs w26, w27, #0x1 (1)", "cfinv", "mov x10, x26" ] @@ -181,8 +178,8 @@ "cmp rsi, rax" ], "ExpectedArm64ASM": [ - "ldr q16, [x16, x4, sxtx]", - "add v16.2d, v16.2d, v17.2d", + "ldr q2, [x16, x4, sxtx]", + "add v16.2d, v2.2d, v17.2d", "str q16, [x16, x4, sxtx]", "add x4, x4, #0x10 (16)", "eor w27, w10, w4", @@ -191,7 +188,7 @@ ] }, "bytemark data xor loop": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 13, "Comment": [ "Saw this in bytemark" ], @@ -208,13 +205,10 @@ ], "ExpectedArm64ASM": [ "mov x20, x4", - "mov x6, x20", - "mov x5, x20", - "mov x19, x10", "add x4, x20, #0x1 (1)", "lsr x6, x20, #6", "and w5, w20, #0x3f", - "lsl x19, x19, x5", + "lsl x19, x10, x5", "add x20, x7, x6, lsl #3", "ldr x20, [x20]", "eor x20, x20, x19", diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json b/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json index 499d03682b..75cba2d486 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json @@ -147,7 +147,7 @@ ] }, "Psychonauts matrix swizzle": { - "ExpectedInstructionCount": 2426, + "ExpectedInstructionCount": 2364, "Comment": [ "Hottest block in Windows Psychonauts", "Doing a 4x4 32-bit float matrix swizzle", @@ -343,10 +343,8 @@ "ldrb w23, [x28, #1026]", "lsl w24, w22, w20", "bic w23, w23, w24", - "strb w23, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, w21, sxtw]", "ldr s2, [x5, #16]", "mrs x0, nzcv", @@ -418,10 +416,8 @@ "ldrb w23, [x28, #1026]", "lsl w24, w22, w20", "bic w23, w23, w24", - "strb w23, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, w21, sxtw]", "ldr s2, [x6, #32]", "mrs x0, nzcv", @@ -493,10 +489,8 @@ "ldrb w23, [x28, #1026]", "lsl w24, w22, w20", "bic w23, w23, w24", - "strb w23, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, w21, sxtw]", "ldr s2, [x4, #48]", "mrs x0, nzcv", @@ -568,10 +562,8 @@ "ldrb w23, [x28, #1026]", "lsl w24, w22, w20", "bic w23, w23, w24", - "strb w23, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, w21, sxtw]", "ldr s2, [x5, #4]", "mrs x0, nzcv", @@ -643,10 +635,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w22, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "mov w22, #0xffffffbc", "ldr w6, [x9, w22, sxtw]", "ldr s2, [x6, #20]", @@ -720,10 +710,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, w22, sxtw]", "ldr s2, [x4, #36]", "mrs x0, nzcv", @@ -795,10 +783,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, w22, sxtw]", "ldr s2, [x5, #52]", "mrs x0, nzcv", @@ -870,10 +856,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, w22, sxtw]", "ldr s2, [x6, #8]", "mrs x0, nzcv", @@ -945,10 +929,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, w22, sxtw]", "ldr s2, [x4, #24]", "mrs x0, nzcv", @@ -1020,10 +1002,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w23, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "mov w22, #0xffffffbc", "ldr w5, [x9, w22, sxtw]", "ldr s2, [x5, #40]", @@ -1097,10 +1077,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, w22, sxtw]", "ldr s2, [x6, #56]", "mrs x0, nzcv", @@ -1172,10 +1150,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, w22, sxtw]", "ldr s2, [x4, #12]", "mrs x0, nzcv", @@ -1247,10 +1223,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, w22, sxtw]", "ldr s2, [x5, #28]", "mrs x0, nzcv", @@ -1322,10 +1296,8 @@ "ldrb w21, [x28, #1026]", "lsl w24, w23, w20", "bic w21, w21, w24", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, w22, sxtw]", "ldr s2, [x6, #44]", "mrs x0, nzcv", @@ -1397,10 +1369,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w23, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "mov w22, #0xffffffbc", "ldr w4, [x9, w22, sxtw]", "ldr s2, [x4, #60]", @@ -1474,10 +1444,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w23, #0xffffffc0", "ldr s2, [x9, w23, sxtw]", @@ -1549,10 +1517,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, #8]", "mov w23, #0xffffffc4", "ldr s2, [x9, w23, sxtw]", @@ -1624,10 +1590,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, #8]", "mov w23, #0xffffffc8", "ldr s2, [x9, w23, sxtw]", @@ -1699,10 +1663,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w23, #0xffffffcc", "ldr s2, [x9, w23, sxtw]", @@ -1774,10 +1736,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w22, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, #8]", "mov w22, #0xffffffd0", "ldr s2, [x9, w22, sxtw]", @@ -1850,10 +1810,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, #8]", "mov w23, #0xffffffd4", "ldr s2, [x9, w23, sxtw]", @@ -1925,10 +1883,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w23, #0xffffffd8", "ldr s2, [x9, w23, sxtw]", @@ -2000,10 +1956,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, #8]", "mov w23, #0xffffffdc", "ldr s2, [x9, w23, sxtw]", @@ -2075,10 +2029,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, #8]", "mov w23, #0xffffffe0", "ldr s2, [x9, w23, sxtw]", @@ -2150,10 +2102,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w22, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w22, #0xffffffe4", "ldr s2, [x9, w22, sxtw]", @@ -2226,10 +2176,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, #8]", "mov w23, #0xffffffe8", "ldr s2, [x9, w23, sxtw]", @@ -2301,10 +2249,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, #8]", "mov w23, #0xffffffec", "ldr s2, [x9, w23, sxtw]", @@ -2376,10 +2322,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w23, #0xfffffff0", "ldr s2, [x9, w23, sxtw]", @@ -2451,10 +2395,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w6, [x9, #8]", "mov w23, #0xfffffff4", "ldr s2, [x9, w23, sxtw]", @@ -2526,10 +2468,8 @@ "ldrb w21, [x28, #1026]", "lsl w22, w22, w20", "bic w21, w21, w22", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w4, [x9, #8]", "mov w22, #0xfffffff8", "ldr s2, [x9, w22, sxtw]", @@ -2602,10 +2542,8 @@ "ldrb w21, [x28, #1026]", "lsl w23, w22, w20", "bic w21, w21, w23", - "strb w21, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", - "strb w20, [x28, #747]", "ldr w5, [x9, #8]", "mov w23, #0xfffffffc", "ldr s2, [x9, w23, sxtw]", diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks_AFP.json b/unittests/InstructionCountCI/FlagM/HotBlocks_AFP.json index f1871c91a3..9c196355c8 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks_AFP.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks_AFP.json @@ -13,7 +13,7 @@ }, "Instructions": { "FMOD scalar loop": { - "ExpectedInstructionCount": 72, + "ExpectedInstructionCount": 70, "x86Insts": [ "mov esi, ecx", "mov rdx, rbp", @@ -55,76 +55,74 @@ "sub esi, 0x1" ], "ExpectedArm64ASM": [ - "mov w10, w5", - "mov x6, x9", - "mov x4, x7", - "ldr s18, [x6]", - "add x4, x4, #0x20 (32)", - "fmul s18, s18, s16", - "add x6, x6, #0x20 (32)", + "mov w27, w5", + "ldr s2, [x9]", + "add x4, x7, #0x20 (32)", + "fmul s2, s2, s16", + "add x6, x9, #0x20 (32)", "sub x20, x4, #0x20 (32)", - "ldr s2, [x20]", - "fadd s18, s18, s2", + "ldr s3, [x20]", + "fadd s2, s2, s3", "sub x20, x4, #0x20 (32)", - "str s18, [x20]", + "str s2, [x20]", "sub x20, x6, #0x1c (28)", - "ldr s18, [x20]", - "fmul s18, s18, s17", - "sub x20, x4, #0x1c (28)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s17", "sub x20, x4, #0x1c (28)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0x1c (28)", + "str s2, [x20]", "sub x20, x6, #0x18 (24)", - "ldr s18, [x20]", - "fmul s18, s18, s16", - "sub x20, x4, #0x18 (24)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s16", "sub x20, x4, #0x18 (24)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0x18 (24)", + "str s2, [x20]", "sub x20, x6, #0x14 (20)", - "ldr s18, [x20]", - "fmul s18, s18, s17", - "sub x20, x4, #0x14 (20)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s17", "sub x20, x4, #0x14 (20)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0x14 (20)", + "str s2, [x20]", "sub x20, x6, #0x10 (16)", - "ldr s18, [x20]", - "fmul s18, s18, s16", - "sub x20, x4, #0x10 (16)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s16", "sub x20, x4, #0x10 (16)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0x10 (16)", + "str s2, [x20]", "sub x20, x6, #0xc (12)", - "ldr s18, [x20]", - "fmul s18, s18, s17", - "sub x20, x4, #0xc (12)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s17", "sub x20, x4, #0xc (12)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0xc (12)", + "str s2, [x20]", "sub x20, x6, #0x8 (8)", - "ldr s18, [x20]", - "fmul s18, s18, s16", - "sub x20, x4, #0x8 (8)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s16", "sub x20, x4, #0x8 (8)", - "str s18, [x20]", + "ldr s3, [x20]", + "fadd s2, s2, s3", + "sub x20, x4, #0x8 (8)", + "str s2, [x20]", "sub x20, x6, #0x4 (4)", - "ldr s18, [x20]", - "fmul s18, s18, s17", - "sub x20, x4, #0x4 (4)", "ldr s2, [x20]", - "fadd s18, s18, s2", + "fmul s2, s2, s17", + "sub x20, x4, #0x4 (4)", + "ldr s3, [x20]", + "mov v18.16b, v2.16b", + "fadd s18, s2, s3", "sub x20, x4, #0x4 (4)", "str s18, [x20]", - "mov x27, x10", - "subs w26, w10, #0x1 (1)", + "subs w26, w27, #0x1 (1)", "cfinv", "mov x10, x26" ] diff --git a/unittests/InstructionCountCI/FlagM/x87.json b/unittests/InstructionCountCI/FlagM/x87.json index 010aaefe27..5bcbafa0b0 100644 --- a/unittests/InstructionCountCI/FlagM/x87.json +++ b/unittests/InstructionCountCI/FlagM/x87.json @@ -6757,7 +6757,7 @@ ] }, "fucompp": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 57, "Comment": [ "0xda 11b 0xe9 /5" ], @@ -6811,7 +6811,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", @@ -14681,7 +14680,7 @@ ] }, "fcompp": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 57, "Comment": [ "0xde 11b 0xd9 /3" ], @@ -14735,7 +14734,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", diff --git a/unittests/InstructionCountCI/FlagM/x87_f64.json b/unittests/InstructionCountCI/FlagM/x87_f64.json index 725fff198f..f0257b6dad 100644 --- a/unittests/InstructionCountCI/FlagM/x87_f64.json +++ b/unittests/InstructionCountCI/FlagM/x87_f64.json @@ -4071,7 +4071,7 @@ ] }, "fucompp": { - "ExpectedInstructionCount": 30, + "ExpectedInstructionCount": 29, "Comment": [ "0xda 11b 0xe9 /5" ], @@ -4097,7 +4097,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", @@ -6404,7 +6403,7 @@ ] }, "frstor [rax]": { - "ExpectedInstructionCount": 325, + "ExpectedInstructionCount": 324, "Comment": [ "0xdd !11b /4" ], @@ -6420,7 +6419,6 @@ "bfi x0, x1, #24, #1", "msr fpcr, x0", "strh w20, [x28, #1024]", - "strh w20, [x28, #1024]", "ldr w20, [x4, #4]", "ubfx w21, w20, #11, #3", "strb w21, [x28, #747]", @@ -8570,7 +8568,7 @@ ] }, "fcompp": { - "ExpectedInstructionCount": 30, + "ExpectedInstructionCount": 29, "Comment": [ "0xde 11b 0xd9 /3" ], @@ -8596,7 +8594,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", diff --git a/unittests/InstructionCountCI/x87.json b/unittests/InstructionCountCI/x87.json index fd3f132516..c6f1e8f31b 100644 --- a/unittests/InstructionCountCI/x87.json +++ b/unittests/InstructionCountCI/x87.json @@ -6756,7 +6756,7 @@ ] }, "fucompp": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 57, "Comment": [ "0xda 11b 0xe9 /5" ], @@ -6810,7 +6810,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", @@ -14696,7 +14695,7 @@ ] }, "fcompp": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 57, "Comment": [ "0xde 11b 0xd9 /3" ], @@ -14750,7 +14749,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", diff --git a/unittests/InstructionCountCI/x87_f64.json b/unittests/InstructionCountCI/x87_f64.json index c2b7bb07a0..ea97056ab9 100644 --- a/unittests/InstructionCountCI/x87_f64.json +++ b/unittests/InstructionCountCI/x87_f64.json @@ -4091,7 +4091,7 @@ ] }, "fucompp": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 30, "Comment": [ "0xda 11b 0xe9 /5" ], @@ -4118,7 +4118,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20", @@ -6523,7 +6522,7 @@ ] }, "frstor [rax]": { - "ExpectedInstructionCount": 325, + "ExpectedInstructionCount": 324, "Comment": [ "0xdd !11b /4" ], @@ -6539,7 +6538,6 @@ "bfi x0, x1, #24, #1", "msr fpcr, x0", "strh w20, [x28, #1024]", - "strh w20, [x28, #1024]", "ldr w20, [x4, #4]", "ubfx w21, w20, #11, #3", "strb w21, [x28, #747]", @@ -8707,7 +8705,7 @@ ] }, "fcompp": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 30, "Comment": [ "0xde 11b 0xd9 /3" ], @@ -8734,7 +8732,6 @@ "ldrb w22, [x28, #1026]", "lsl w23, w21, w20", "bic w22, w22, w23", - "strb w22, [x28, #1026]", "add w20, w20, #0x1 (1)", "and w20, w20, #0x7", "lsl w21, w21, w20",