diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst.json b/unittests/InstructionCountCI/FEXOpt/MultiInst.json
index d9da698cd6..ec00b27055 100644
--- a/unittests/InstructionCountCI/FEXOpt/MultiInst.json
+++ b/unittests/InstructionCountCI/FEXOpt/MultiInst.json
@@ -210,7 +210,7 @@
       ]
     },
     "positive rep movsb": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -225,6 +225,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x14",
         "ldrb w3, [x2], #1",
         "strb w3, [x1], #1",
@@ -241,7 +264,7 @@
       ]
     },
     "positive rep movsw": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -256,6 +279,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x14",
         "ldrh w3, [x2], #2",
         "strh w3, [x1], #2",
@@ -272,7 +318,7 @@
       ]
     },
     "positive rep movsd": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -287,6 +333,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x14",
         "ldr w3, [x2], #4",
         "str w3, [x1], #4",
@@ -303,7 +372,7 @@
       ]
     },
     "positive rep movsq": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -318,6 +387,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x14",
         "ldr x3, [x2], #8",
         "str x3, [x1], #8",
@@ -334,7 +426,7 @@
       ]
     },
     "negative rep movsb": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -349,6 +441,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x14",
         "ldrb w3, [x2], #-1",
         "strb w3, [x1], #-1",
@@ -365,7 +480,7 @@
       ]
     },
     "negative rep movsw": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -380,6 +495,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x14",
         "ldrh w3, [x2], #-2",
         "strh w3, [x1], #-2",
@@ -396,7 +534,7 @@
       ]
     },
     "negative rep movsd": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -411,6 +549,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x14",
         "ldr w3, [x2], #-4",
         "str w3, [x1], #-4",
@@ -427,7 +588,7 @@
       ]
     },
     "negative rep movsq": {
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 41,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -442,6 +603,29 @@
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x14",
         "ldr x3, [x2], #-8",
         "str x3, [x1], #-8",
@@ -458,7 +642,7 @@
       ]
     },
     "positive rep stosb": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -473,6 +657,26 @@
         "uxtb w21, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.16b, w21",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x10",
         "strb w21, [x1], #1",
         "sub x0, x0, #0x1 (1)",
@@ -482,7 +686,7 @@
       ]
     },
     "positive rep stosw": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -497,6 +701,26 @@
         "uxth w21, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.8h, w21",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x10",
         "strh w21, [x1], #2",
         "sub x0, x0, #0x1 (1)",
@@ -506,7 +730,7 @@
       ]
     },
     "positive rep stosd": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -521,6 +745,26 @@
         "mov w21, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.4s, w21",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x10",
         "str w21, [x1], #4",
         "sub x0, x0, #0x1 (1)",
@@ -530,7 +774,7 @@
       ]
     },
    "positive rep stosq": {
-      "ExpectedInstructionCount": 10,
+      "ExpectedInstructionCount": 30,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -544,6 +788,26 @@
         "strb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.2d, x4",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x10",
         "str x4, [x1], #8",
         "sub x0, x0, #0x1 (1)",
@@ -553,7 +817,7 @@
       ]
     },
     "negative rep stosb": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -568,6 +832,26 @@
         "uxtb w20, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.16b, w20",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x10",
         "strb w20, [x1], #-1",
         "sub x0, x0, #0x1 (1)",
@@ -577,7 +861,7 @@
       ]
     },
     "negative rep stosw": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -592,6 +876,26 @@
         "uxth w20, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.8h, w20",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x10",
         "strh w20, [x1], #-2",
         "sub x0, x0, #0x1 (1)",
@@ -601,7 +905,7 @@
       ]
     },
     "negative rep stosd": {
-      "ExpectedInstructionCount": 11,
+      "ExpectedInstructionCount": 31,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -616,6 +920,26 @@
         "mov w20, w4",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.4s, w20",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x10",
         "str w20, [x1], #-4",
         "sub x0, x0, #0x1 (1)",
@@ -625,7 +949,7 @@
       ]
     },
     "negative rep stosq": {
-      "ExpectedInstructionCount": 10,
+      "ExpectedInstructionCount": 30,
       "Comment": [
         "When direction flag is a compile time constant we can optimize",
         "loads and stores can turn in to post-increment when known"
@@ -639,6 +963,26 @@
         "strb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.2d, x4",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x10",
         "str x4, [x1], #-8",
         "sub x0, x0, #0x1 (1)",
diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json
index a7f8fad12c..dd05c645a9 100644
--- a/unittests/InstructionCountCI/Primary.json
+++ b/unittests/InstructionCountCI/Primary.json
@@ -2861,14 +2861,37 @@
       ]
     },
     "rep movsb": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 75,
       "Comment": "0xa4",
       "ExpectedArm64ASM": [
         "ldrb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
-        "cbnz x20, #+0x30",
+        "cbnz x20, #+0x8c",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x14",
         "ldrb w3, [x2], #1",
         "strb w3, [x1], #1",
@@ -2879,7 +2902,30 @@
         "mov x2, x5",
         "add x20, x0, x2",
         "add x21, x1, x2",
-        "b #+0x2c",
+        "b #+0x88",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x14",
         "ldrb w3, [x2], #-1",
         "strb w3, [x1], #-1",
@@ -2896,14 +2942,37 @@
       ]
     },
     "rep movsw": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 75,
       "Comment": "0xa5",
       "ExpectedArm64ASM": [
         "ldrb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
-        "cbnz x20, #+0x30",
+        "cbnz x20, #+0x8c",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x14",
         "ldrh w3, [x2], #2",
         "strh w3, [x1], #2",
@@ -2914,7 +2983,30 @@
         "mov x2, x5",
         "add x20, x0, x2, lsl #1",
         "add x21, x1, x2, lsl #1",
-        "b #+0x2c",
+        "b #+0x88",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x14",
         "ldrh w3, [x2], #-2",
         "strh w3, [x1], #-2",
@@ -2931,14 +3023,37 @@
       ]
     },
     "rep movsd": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 75,
       "Comment": "0xa5",
       "ExpectedArm64ASM": [
         "ldrb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
-        "cbnz x20, #+0x30",
+        "cbnz x20, #+0x8c",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x14",
         "ldr w3, [x2], #4",
         "str w3, [x1], #4",
@@ -2949,7 +3064,30 @@
         "mov x2, x5",
         "add x20, x0, x2, lsl #2",
         "add x21, x1, x2, lsl #2",
-        "b #+0x2c",
+        "b #+0x88",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x14",
         "ldr w3, [x2], #-4",
         "str w3, [x1], #-4",
@@ -2966,14 +3104,37 @@
       ]
     },
     "rep movsq": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 75,
       "Comment": "0xa5",
       "ExpectedArm64ASM": [
         "ldrb w20, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
         "mov x2, x10",
-        "cbnz x20, #+0x30",
+        "cbnz x20, #+0x8c",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #32",
+        "stp q0, q1, [x1], #32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x14",
         "ldr x3, [x2], #8",
         "str x3, [x1], #8",
@@ -2984,7 +3145,30 @@
         "mov x2, x5",
         "add x20, x0, x2, lsl #3",
         "add x21, x1, x2, lsl #3",
-        "b #+0x2c",
+        "b #+0x88",
+        "cbz x0, #+0x70",
+        "orr x3, x1, x2",
+        "and x3, x3, #0x3",
+        "cbnz x3, #+0x54",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x44",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x1c",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x14",
+        "add x0, x0, #0x8 (8)",
+        "cbz x0, #+0x34",
+        "sub x0, x0, #0x4 (4)",
+        "tbnz x0, #63, #+0x14",
+        "ldp q0, q1, [x2], #-32",
+        "stp q0, q1, [x1], #-32",
+        "sub x0, x0, #0x4 (4)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x4 (4)",
         "cbz x0, #+0x14",
         "ldr x3, [x2], #-8",
         "str x3, [x1], #-8",
@@ -3368,20 +3552,60 @@
       ]
     },
     "rep stosb": {
-      "ExpectedInstructionCount": 17,
+      "ExpectedInstructionCount": 57,
       "Comment": "0xaa",
       "ExpectedArm64ASM": [
         "uxtb w20, w4",
         "ldrb w21, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
-        "cbnz x21, #+0x1c",
+        "cbnz x21, #+0x6c",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.16b, w20",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x10",
         "strb w20, [x1], #1",
         "sub x0, x0, #0x1 (1)",
         "cbnz x0, #-0x8",
         "add x11, x11, x5",
-        "b #+0x18",
+        "b #+0x68",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.16b, w20",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x40 (64)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x40 (64)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x20 (32)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x20 (32)",
         "cbz x0, #+0x10",
         "strb w20, [x1], #-1",
         "sub x0, x0, #0x1 (1)",
@@ -3391,20 +3615,60 @@
       ]
     },
     "rep stosw": {
-      "ExpectedInstructionCount": 17,
+      "ExpectedInstructionCount": 57,
       "Comment": "0xab",
       "ExpectedArm64ASM": [
         "uxth w20, w4",
         "ldrb w21, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
-        "cbnz x21, #+0x1c",
+        "cbnz x21, #+0x6c",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.8h, w20",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x10",
         "strh w20, [x1], #2",
         "sub x0, x0, #0x1 (1)",
         "cbnz x0, #-0x8",
         "add x11, x11, x5, lsl #1",
-        "b #+0x18",
+        "b #+0x68",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.8h, w20",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x20 (32)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x20 (32)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x10 (16)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x10 (16)",
         "cbz x0, #+0x10",
         "strh w20, [x1], #-2",
         "sub x0, x0, #0x1 (1)",
@@ -3414,20 +3678,60 @@
       ]
     },
     "rep stosd": {
-      "ExpectedInstructionCount": 17,
+      "ExpectedInstructionCount": 57,
       "Comment": "0xab",
       "ExpectedArm64ASM": [
         "mov w20, w4",
         "ldrb w21, [x28, #714]",
         "mov x0, x5",
         "mov x1, x11",
-        "cbnz x21, #+0x1c",
+        "cbnz x21, #+0x6c",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.4s, w20",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #32",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x10",
         "str w20, [x1], #4",
         "sub x0, x0, #0x1 (1)",
         "cbnz x0, #-0x8",
         "add x11, x11, x5, lsl #2",
-        "b #+0x18",
+        "b #+0x68",
+        "cbz x0, #+0x60",
+        "and x3, x1, #0x3",
+        "cbnz x3, #+0x4c",
+        "dup v1.4s, w20",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x38",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x14",
+        "stp q1, q1, [x1], #-32",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x10 (16)",
+        "tbz x0, #63, #-0xc",
+        "add x0, x0, #0x10 (16)",
+        "cbz x0, #+0x2c",
+        "sub x0, x0, #0x8 (8)",
+        "tbnz x0, #63, #+0x10",
+        "stp q1, q1, [x1], #-32",
+        "sub x0, x0, #0x8 (8)",
+        "tbz x0, #63, #-0x8",
+        "add x0, x0, #0x8 (8)",
         "cbz x0, #+0x10",
         "str w20, [x1], #-4",
         "sub x0, x0, #0x1 (1)",
@@ -3437,7 +3741,7 @@
       ]
     },
     "rep stosq": {
-      "ExpectedInstructionCount": 16,
+      "ExpectedInstructionCount": 56,
       "Comment": [
        "Unrolling the loop for faster memset can be done.",
        "Taking advantage of ARM MOPs instructions can be done",
@@ -3447,13 +3751,53 @@ "ExpectedArm64ASM": [
        "ldrb w20, [x28, #714]",
        "mov x0, x5",
        "mov x1, x11",
-       "cbnz x20, #+0x1c",
+       "cbnz x20, #+0x6c",
+       "cbz x0, #+0x60",
+       "and x3, x1, #0x3",
+       "cbnz x3, #+0x4c",
+       "dup v1.2d, x4",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x38",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x14",
+       "stp q1, q1, [x1], #32",
+       "stp q1, q1, [x1], #32",
+       "sub x0, x0, #0x8 (8)",
+       "tbz x0, #63, #-0xc",
+       "add x0, x0, #0x8 (8)",
+       "cbz x0, #+0x2c",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x10",
+       "stp q1, q1, [x1], #32",
+       "sub x0, x0, #0x4 (4)",
+       "tbz x0, #63, #-0x8",
+       "add x0, x0, #0x4 (4)",
        "cbz x0, #+0x10",
        "str x4, [x1], #8",
        "sub x0, x0, #0x1 (1)",
        "cbnz x0, #-0x8",
        "add x11, x11, x5, lsl #3",
-       "b #+0x18",
+       "b #+0x68",
+       "cbz x0, #+0x60",
+       "and x3, x1, #0x3",
+       "cbnz x3, #+0x4c",
+       "dup v1.2d, x4",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x38",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x14",
+       "stp q1, q1, [x1], #-32",
+       "stp q1, q1, [x1], #-32",
+       "sub x0, x0, #0x8 (8)",
+       "tbz x0, #63, #-0xc",
+       "add x0, x0, #0x8 (8)",
+       "cbz x0, #+0x2c",
+       "sub x0, x0, #0x4 (4)",
+       "tbnz x0, #63, #+0x10",
+       "stp q1, q1, [x1], #-32",
+       "sub x0, x0, #0x4 (4)",
+       "tbz x0, #63, #-0x8",
+       "add x0, x0, #0x4 (4)",
        "cbz x0, #+0x10",
        "str x4, [x1], #-8",
        "sub x0, x0, #0x1 (1)",