Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes CMake configuration on AArch64 host #16

Merged
merged 1 commit into from
Mar 13, 2020

Conversation

Sonicadvance1
Copy link
Member

No description provided.

@Sonicadvance1 Sonicadvance1 merged commit 7cb97ed into FEX-Emu:master Mar 13, 2020
@Sonicadvance1 Sonicadvance1 deleted the fixes_cmake_aarch64 branch March 13, 2020 20:04
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 7, 2023
Only return the particular flags that are being requested at the moment,
since compacting all of them on every request is fairly slow.

x87 fcmov in particular was requesting all the flags when it only needs
a couple.
This reduces a `fcmovb` instruction count blowup from 103x to 48x. Still
more room to go but this one stood out as being particularly bad.

Old:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b0397    ldrb w23, [x28, FEX-Emu#704]
0x0000000265a002d4  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002d8  aa1702d6    orr x22, x22, x23
0x0000000265a002dc  394b0b97    ldrb w23, [x28, FEX-Emu#706]
0x0000000265a002e0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002e4  531e76f7    lsl w23, w23, FEX-Emu#2
0x0000000265a002e8  aa1702d6    orr x22, x22, x23
0x0000000265a002ec  394b1397    ldrb w23, [x28, FEX-Emu#708]
0x0000000265a002f0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002f4  531c6ef7    lsl w23, w23, FEX-Emu#4
0x0000000265a002f8  aa1702d6    orr x22, x22, x23
0x0000000265a002fc  394b1b97    ldrb w23, [x28, FEX-Emu#710]
0x0000000265a00300  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00304  531a66f7    lsl w23, w23, FEX-Emu#6
0x0000000265a00308  aa1702d6    orr x22, x22, x23
0x0000000265a0030c  394b1f97    ldrb w23, [x28, FEX-Emu#711]
0x0000000265a00310  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00314  531962f7    lsl w23, w23, FEX-Emu#7
0x0000000265a00318  aa1702d6    orr x22, x22, x23
0x0000000265a0031c  394b2397    ldrb w23, [x28, FEX-Emu#712]
0x0000000265a00320  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00324  53185ef7    lsl w23, w23, FEX-Emu#8
0x0000000265a00328  aa1702d6    orr x22, x22, x23
0x0000000265a0032c  394b2797    ldrb w23, [x28, FEX-Emu#713]
0x0000000265a00330  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00334  53175af7    lsl w23, w23, FEX-Emu#9
0x0000000265a00338  aa1702d6    orr x22, x22, x23
0x0000000265a0033c  394b2b97    ldrb w23, [x28, FEX-Emu#714]
0x0000000265a00340  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00344  531656f7    lsl w23, w23, FEX-Emu#10
0x0000000265a00348  aa1702d6    orr x22, x22, x23
0x0000000265a0034c  394b2f97    ldrb w23, [x28, FEX-Emu#715]
0x0000000265a00350  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00354  531552f7    lsl w23, w23, FEX-Emu#11
0x0000000265a00358  aa1702d6    orr x22, x22, x23
0x0000000265a0035c  394b3397    ldrb w23, [x28, FEX-Emu#716]
0x0000000265a00360  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00364  53144ef7    lsl w23, w23, FEX-Emu#12
0x0000000265a00368  aa1702d6    orr x22, x22, x23
0x0000000265a0036c  394b3b97    ldrb w23, [x28, FEX-Emu#718]
0x0000000265a00370  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00374  531246f7    lsl w23, w23, FEX-Emu#14
0x0000000265a00378  aa1702d6    orr x22, x22, x23
0x0000000265a0037c  394b4397    ldrb w23, [x28, FEX-Emu#720]
0x0000000265a00380  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00384  53103ef7    lsl w23, w23, FEX-Emu#16
0x0000000265a00388  aa1702d6    orr x22, x22, x23
0x0000000265a0038c  394b4797    ldrb w23, [x28, FEX-Emu#721]
0x0000000265a00390  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00394  530f3af7    lsl w23, w23, FEX-Emu#17
0x0000000265a00398  aa1702d6    orr x22, x22, x23
0x0000000265a0039c  394b4b97    ldrb w23, [x28, FEX-Emu#722]
0x0000000265a003a0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003a4  530e36f7    lsl w23, w23, FEX-Emu#18
0x0000000265a003a8  aa1702d6    orr x22, x22, x23
0x0000000265a003ac  394b4f97    ldrb w23, [x28, FEX-Emu#723]
0x0000000265a003b0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003b4  530d32f7    lsl w23, w23, FEX-Emu#19
0x0000000265a003b8  aa1702d6    orr x22, x22, x23
0x0000000265a003bc  394b5397    ldrb w23, [x28, FEX-Emu#724]
0x0000000265a003c0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003c4  530c2ef7    lsl w23, w23, FEX-Emu#20
0x0000000265a003c8  aa1702d6    orr x22, x22, x23
0x0000000265a003cc  394b5797    ldrb w23, [x28, FEX-Emu#725]
0x0000000265a003d0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003d4  530b2af7    lsl w23, w23, FEX-Emu#21
0x0000000265a003d8  aa1702d6    orr x22, x22, x23
0x0000000265a003dc  924002d6    and x22, x22, #0x1
0x0000000265a003e0  93400294    sbfx x20, x20, #0, FEX-Emu#1
0x0000000265a003e4  934002b5    sbfx x21, x21, #0, FEX-Emu#1
0x0000000265a003e8  f10002df    cmp x22, #0x0 (0)
0x0000000265a003ec  9a950294    csel x20, x20, x21, eq
0x0000000265a003f0  4e080e84    dup v4.2d, x20
0x0000000265a003f4  394baf94    ldrb w20, [x28, FEX-Emu#747]
0x0000000265a003f8  91000695    add x21, x20, #0x1 (1)
0x0000000265a003fc  92400ab5    and x21, x21, #0x7
0x0000000265a00400  d2800200    mov x0, #0x10
0x0000000265a00404  9b007e80    mul x0, x20, x0
0x0000000265a00408  8b000380    add x0, x28, x0
0x0000000265a0040c  3dc0bc05    ldr q5, [x0, FEX-Emu#752]
0x0000000265a00410  d2800200    mov x0, #0x10
0x0000000265a00414  9b007ea0    mul x0, x21, x0
0x0000000265a00418  8b000380    add x0, x28, x0
0x0000000265a0041c  3dc0bc06    ldr q6, [x0, FEX-Emu#752]
0x0000000265a00420  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00424  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a00428  4ea01c04    mov v4.16b, v0.16b
0x0000000265a0042c  d2800200    mov x0, #0x10
0x0000000265a00430  9b007e80    mul x0, x20, x0
0x0000000265a00434  8b000380    add x0, x28, x0
0x0000000265a00438  3d80bc04    str q4, [x0, FEX-Emu#752]
0x0000000265a0043c  58000040    ldr x0, pc+8 (addr 0x265a00444)
0x0000000265a00440  d63f0000    blr x0
```

New:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, #184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b1f97    ldrb w23, [x28, #711]
0x0000000265a002d4  331902f6    bfi w22, w23, #7, #1
0x0000000265a002d8  394b2797    ldrb w23, [x28, #713]
0x0000000265a002dc  331702f6    bfi w22, w23, #9, #1
0x0000000265a002e0  394b2f97    ldrb w23, [x28, #715]
0x0000000265a002e4  331502f6    bfi w22, w23, #11, #1
0x0000000265a002e8  394b4797    ldrb w23, [x28, #721]
0x0000000265a002ec  330f02f6    bfi w22, w23, #17, #1
0x0000000265a002f0  394b4f97    ldrb w23, [x28, #723]
0x0000000265a002f4  330d02f6    bfi w22, w23, #19, #1
0x0000000265a002f8  394b5797    ldrb w23, [x28, #725]
0x0000000265a002fc  330b02f6    bfi w22, w23, #21, #1
0x0000000265a00300  924002d6    and x22, x22, #0x1
0x0000000265a00304  93400294    sbfx x20, x20, #0, #1
0x0000000265a00308  934002b5    sbfx x21, x21, #0, #1
0x0000000265a0030c  f10002df    cmp x22, #0x0 (0)
0x0000000265a00310  9a950294    csel x20, x20, x21, eq
0x0000000265a00314  4e080e84    dup v4.2d, x20
0x0000000265a00318  394baf94    ldrb w20, [x28, #747]
0x0000000265a0031c  91000695    add x21, x20, #0x1 (1)
0x0000000265a00320  92400ab5    and x21, x21, #0x7
0x0000000265a00324  d2800200    mov x0, #0x10
0x0000000265a00328  9b007e80    mul x0, x20, x0
0x0000000265a0032c  8b000380    add x0, x28, x0
0x0000000265a00330  3dc0bc05    ldr q5, [x0, #752]
0x0000000265a00334  d2800200    mov x0, #0x10
0x0000000265a00338  9b007ea0    mul x0, x21, x0
0x0000000265a0033c  8b000380    add x0, x28, x0
0x0000000265a00340  3dc0bc06    ldr q6, [x0, #752]
0x0000000265a00344  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00348  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a0034c  4ea01c04    mov v4.16b, v0.16b
0x0000000265a00350  d2800200    mov x0, #0x10
0x0000000265a00354  9b007e80    mul x0, x20, x0
0x0000000265a00358  8b000380    add x0, x28, x0
0x0000000265a0035c  3d80bc04    str q4, [x0, #752]
0x0000000265a00360  58000040    ldr x0, pc+8 (addr 0x265a00368)
0x0000000265a00364  d63f0000    blr x0
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 7, 2023
Only return the particular flags that are being requested at the moment,
since compacting all of them on every request is fairly slow.

x87 fcmov in particular was requesting all the flags when it only needs
a couple.
This reduces a `fcmovb` instruction count blowup from 103x to 38x. Still
more room to go but this one stood out as being particularly bad.

Old:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b0397    ldrb w23, [x28, FEX-Emu#704]
0x0000000265a002d4  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002d8  aa1702d6    orr x22, x22, x23
0x0000000265a002dc  394b0b97    ldrb w23, [x28, FEX-Emu#706]
0x0000000265a002e0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002e4  531e76f7    lsl w23, w23, FEX-Emu#2
0x0000000265a002e8  aa1702d6    orr x22, x22, x23
0x0000000265a002ec  394b1397    ldrb w23, [x28, FEX-Emu#708]
0x0000000265a002f0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002f4  531c6ef7    lsl w23, w23, FEX-Emu#4
0x0000000265a002f8  aa1702d6    orr x22, x22, x23
0x0000000265a002fc  394b1b97    ldrb w23, [x28, FEX-Emu#710]
0x0000000265a00300  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00304  531a66f7    lsl w23, w23, FEX-Emu#6
0x0000000265a00308  aa1702d6    orr x22, x22, x23
0x0000000265a0030c  394b1f97    ldrb w23, [x28, FEX-Emu#711]
0x0000000265a00310  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00314  531962f7    lsl w23, w23, FEX-Emu#7
0x0000000265a00318  aa1702d6    orr x22, x22, x23
0x0000000265a0031c  394b2397    ldrb w23, [x28, FEX-Emu#712]
0x0000000265a00320  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00324  53185ef7    lsl w23, w23, FEX-Emu#8
0x0000000265a00328  aa1702d6    orr x22, x22, x23
0x0000000265a0032c  394b2797    ldrb w23, [x28, FEX-Emu#713]
0x0000000265a00330  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00334  53175af7    lsl w23, w23, FEX-Emu#9
0x0000000265a00338  aa1702d6    orr x22, x22, x23
0x0000000265a0033c  394b2b97    ldrb w23, [x28, FEX-Emu#714]
0x0000000265a00340  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00344  531656f7    lsl w23, w23, FEX-Emu#10
0x0000000265a00348  aa1702d6    orr x22, x22, x23
0x0000000265a0034c  394b2f97    ldrb w23, [x28, FEX-Emu#715]
0x0000000265a00350  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00354  531552f7    lsl w23, w23, FEX-Emu#11
0x0000000265a00358  aa1702d6    orr x22, x22, x23
0x0000000265a0035c  394b3397    ldrb w23, [x28, FEX-Emu#716]
0x0000000265a00360  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00364  53144ef7    lsl w23, w23, FEX-Emu#12
0x0000000265a00368  aa1702d6    orr x22, x22, x23
0x0000000265a0036c  394b3b97    ldrb w23, [x28, FEX-Emu#718]
0x0000000265a00370  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00374  531246f7    lsl w23, w23, FEX-Emu#14
0x0000000265a00378  aa1702d6    orr x22, x22, x23
0x0000000265a0037c  394b4397    ldrb w23, [x28, FEX-Emu#720]
0x0000000265a00380  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00384  53103ef7    lsl w23, w23, FEX-Emu#16
0x0000000265a00388  aa1702d6    orr x22, x22, x23
0x0000000265a0038c  394b4797    ldrb w23, [x28, FEX-Emu#721]
0x0000000265a00390  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00394  530f3af7    lsl w23, w23, FEX-Emu#17
0x0000000265a00398  aa1702d6    orr x22, x22, x23
0x0000000265a0039c  394b4b97    ldrb w23, [x28, FEX-Emu#722]
0x0000000265a003a0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003a4  530e36f7    lsl w23, w23, FEX-Emu#18
0x0000000265a003a8  aa1702d6    orr x22, x22, x23
0x0000000265a003ac  394b4f97    ldrb w23, [x28, FEX-Emu#723]
0x0000000265a003b0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003b4  530d32f7    lsl w23, w23, FEX-Emu#19
0x0000000265a003b8  aa1702d6    orr x22, x22, x23
0x0000000265a003bc  394b5397    ldrb w23, [x28, FEX-Emu#724]
0x0000000265a003c0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003c4  530c2ef7    lsl w23, w23, FEX-Emu#20
0x0000000265a003c8  aa1702d6    orr x22, x22, x23
0x0000000265a003cc  394b5797    ldrb w23, [x28, FEX-Emu#725]
0x0000000265a003d0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003d4  530b2af7    lsl w23, w23, FEX-Emu#21
0x0000000265a003d8  aa1702d6    orr x22, x22, x23
0x0000000265a003dc  924002d6    and x22, x22, #0x1
0x0000000265a003e0  93400294    sbfx x20, x20, #0, FEX-Emu#1
0x0000000265a003e4  934002b5    sbfx x21, x21, #0, FEX-Emu#1
0x0000000265a003e8  f10002df    cmp x22, #0x0 (0)
0x0000000265a003ec  9a950294    csel x20, x20, x21, eq
0x0000000265a003f0  4e080e84    dup v4.2d, x20
0x0000000265a003f4  394baf94    ldrb w20, [x28, FEX-Emu#747]
0x0000000265a003f8  91000695    add x21, x20, #0x1 (1)
0x0000000265a003fc  92400ab5    and x21, x21, #0x7
0x0000000265a00400  d2800200    mov x0, #0x10
0x0000000265a00404  9b007e80    mul x0, x20, x0
0x0000000265a00408  8b000380    add x0, x28, x0
0x0000000265a0040c  3dc0bc05    ldr q5, [x0, FEX-Emu#752]
0x0000000265a00410  d2800200    mov x0, #0x10
0x0000000265a00414  9b007ea0    mul x0, x21, x0
0x0000000265a00418  8b000380    add x0, x28, x0
0x0000000265a0041c  3dc0bc06    ldr q6, [x0, FEX-Emu#752]
0x0000000265a00420  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00424  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a00428  4ea01c04    mov v4.16b, v0.16b
0x0000000265a0042c  d2800200    mov x0, #0x10
0x0000000265a00430  9b007e80    mul x0, x20, x0
0x0000000265a00434  8b000380    add x0, x28, x0
0x0000000265a00438  3d80bc04    str q4, [x0, FEX-Emu#752]
0x0000000265a0043c  58000040    ldr x0, pc+8 (addr 0x265a00444)
0x0000000265a00440  d63f0000    blr x0
```

New:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, #184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b0397    ldrb w23, [x28, #704]
0x0000000265a002d4  330002f6    bfxil w22, w23, #0, #1
0x0000000265a002d8  924002d6    and x22, x22, #0x1
0x0000000265a002dc  93400294    sbfx x20, x20, #0, #1
0x0000000265a002e0  934002b5    sbfx x21, x21, #0, #1
0x0000000265a002e4  f10002df    cmp x22, #0x0 (0)
0x0000000265a002e8  9a950294    csel x20, x20, x21, eq
0x0000000265a002ec  4e080e84    dup v4.2d, x20
0x0000000265a002f0  394baf94    ldrb w20, [x28, #747]
0x0000000265a002f4  91000695    add x21, x20, #0x1 (1)
0x0000000265a002f8  92400ab5    and x21, x21, #0x7
0x0000000265a002fc  d2800200    mov x0, #0x10
0x0000000265a00300  9b007e80    mul x0, x20, x0
0x0000000265a00304  8b000380    add x0, x28, x0
0x0000000265a00308  3dc0bc05    ldr q5, [x0, #752]
0x0000000265a0030c  d2800200    mov x0, #0x10
0x0000000265a00310  9b007ea0    mul x0, x21, x0
0x0000000265a00314  8b000380    add x0, x28, x0
0x0000000265a00318  3dc0bc06    ldr q6, [x0, #752]
0x0000000265a0031c  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00320  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a00324  4ea01c04    mov v4.16b, v0.16b
0x0000000265a00328  d2800200    mov x0, #0x10
0x0000000265a0032c  9b007e80    mul x0, x20, x0
0x0000000265a00330  8b000380    add x0, x28, x0
0x0000000265a00334  3d80bc04    str q4, [x0, #752]
0x0000000265a00338  58000040    ldr x0, pc+8 (addr 0x265a00340)
0x0000000265a0033c  d63f0000    blr x0
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 8, 2023
Only return the particular flags that are being requested at the moment,
since compacting all of them on every request is fairly slow.

x87 fcmov in particular was requesting all the flags when it only needs
a couple.
This reduces a `fcmovb` instruction count blowup from 103x to 38x. Still
more room to go but this one stood out as being particularly bad.

Old:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b0397    ldrb w23, [x28, FEX-Emu#704]
0x0000000265a002d4  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002d8  aa1702d6    orr x22, x22, x23
0x0000000265a002dc  394b0b97    ldrb w23, [x28, FEX-Emu#706]
0x0000000265a002e0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002e4  531e76f7    lsl w23, w23, FEX-Emu#2
0x0000000265a002e8  aa1702d6    orr x22, x22, x23
0x0000000265a002ec  394b1397    ldrb w23, [x28, FEX-Emu#708]
0x0000000265a002f0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a002f4  531c6ef7    lsl w23, w23, FEX-Emu#4
0x0000000265a002f8  aa1702d6    orr x22, x22, x23
0x0000000265a002fc  394b1b97    ldrb w23, [x28, FEX-Emu#710]
0x0000000265a00300  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00304  531a66f7    lsl w23, w23, FEX-Emu#6
0x0000000265a00308  aa1702d6    orr x22, x22, x23
0x0000000265a0030c  394b1f97    ldrb w23, [x28, FEX-Emu#711]
0x0000000265a00310  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00314  531962f7    lsl w23, w23, FEX-Emu#7
0x0000000265a00318  aa1702d6    orr x22, x22, x23
0x0000000265a0031c  394b2397    ldrb w23, [x28, FEX-Emu#712]
0x0000000265a00320  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00324  53185ef7    lsl w23, w23, FEX-Emu#8
0x0000000265a00328  aa1702d6    orr x22, x22, x23
0x0000000265a0032c  394b2797    ldrb w23, [x28, FEX-Emu#713]
0x0000000265a00330  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00334  53175af7    lsl w23, w23, FEX-Emu#9
0x0000000265a00338  aa1702d6    orr x22, x22, x23
0x0000000265a0033c  394b2b97    ldrb w23, [x28, FEX-Emu#714]
0x0000000265a00340  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00344  531656f7    lsl w23, w23, FEX-Emu#10
0x0000000265a00348  aa1702d6    orr x22, x22, x23
0x0000000265a0034c  394b2f97    ldrb w23, [x28, FEX-Emu#715]
0x0000000265a00350  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00354  531552f7    lsl w23, w23, FEX-Emu#11
0x0000000265a00358  aa1702d6    orr x22, x22, x23
0x0000000265a0035c  394b3397    ldrb w23, [x28, FEX-Emu#716]
0x0000000265a00360  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00364  53144ef7    lsl w23, w23, FEX-Emu#12
0x0000000265a00368  aa1702d6    orr x22, x22, x23
0x0000000265a0036c  394b3b97    ldrb w23, [x28, FEX-Emu#718]
0x0000000265a00370  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00374  531246f7    lsl w23, w23, FEX-Emu#14
0x0000000265a00378  aa1702d6    orr x22, x22, x23
0x0000000265a0037c  394b4397    ldrb w23, [x28, FEX-Emu#720]
0x0000000265a00380  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00384  53103ef7    lsl w23, w23, FEX-Emu#16
0x0000000265a00388  aa1702d6    orr x22, x22, x23
0x0000000265a0038c  394b4797    ldrb w23, [x28, FEX-Emu#721]
0x0000000265a00390  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a00394  530f3af7    lsl w23, w23, FEX-Emu#17
0x0000000265a00398  aa1702d6    orr x22, x22, x23
0x0000000265a0039c  394b4b97    ldrb w23, [x28, FEX-Emu#722]
0x0000000265a003a0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003a4  530e36f7    lsl w23, w23, FEX-Emu#18
0x0000000265a003a8  aa1702d6    orr x22, x22, x23
0x0000000265a003ac  394b4f97    ldrb w23, [x28, FEX-Emu#723]
0x0000000265a003b0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003b4  530d32f7    lsl w23, w23, FEX-Emu#19
0x0000000265a003b8  aa1702d6    orr x22, x22, x23
0x0000000265a003bc  394b5397    ldrb w23, [x28, FEX-Emu#724]
0x0000000265a003c0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003c4  530c2ef7    lsl w23, w23, FEX-Emu#20
0x0000000265a003c8  aa1702d6    orr x22, x22, x23
0x0000000265a003cc  394b5797    ldrb w23, [x28, FEX-Emu#725]
0x0000000265a003d0  d3407ef7    ubfx x23, x23, #0, FEX-Emu#32
0x0000000265a003d4  530b2af7    lsl w23, w23, FEX-Emu#21
0x0000000265a003d8  aa1702d6    orr x22, x22, x23
0x0000000265a003dc  924002d6    and x22, x22, #0x1
0x0000000265a003e0  93400294    sbfx x20, x20, #0, FEX-Emu#1
0x0000000265a003e4  934002b5    sbfx x21, x21, #0, FEX-Emu#1
0x0000000265a003e8  f10002df    cmp x22, #0x0 (0)
0x0000000265a003ec  9a950294    csel x20, x20, x21, eq
0x0000000265a003f0  4e080e84    dup v4.2d, x20
0x0000000265a003f4  394baf94    ldrb w20, [x28, FEX-Emu#747]
0x0000000265a003f8  91000695    add x21, x20, #0x1 (1)
0x0000000265a003fc  92400ab5    and x21, x21, #0x7
0x0000000265a00400  d2800200    mov x0, #0x10
0x0000000265a00404  9b007e80    mul x0, x20, x0
0x0000000265a00408  8b000380    add x0, x28, x0
0x0000000265a0040c  3dc0bc05    ldr q5, [x0, FEX-Emu#752]
0x0000000265a00410  d2800200    mov x0, #0x10
0x0000000265a00414  9b007ea0    mul x0, x21, x0
0x0000000265a00418  8b000380    add x0, x28, x0
0x0000000265a0041c  3dc0bc06    ldr q6, [x0, FEX-Emu#752]
0x0000000265a00420  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00424  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a00428  4ea01c04    mov v4.16b, v0.16b
0x0000000265a0042c  d2800200    mov x0, #0x10
0x0000000265a00430  9b007e80    mul x0, x20, x0
0x0000000265a00434  8b000380    add x0, x28, x0
0x0000000265a00438  3d80bc04    str q4, [x0, FEX-Emu#752]
0x0000000265a0043c  58000040    ldr x0, pc+8 (addr 0x265a00444)
0x0000000265a00440  d63f0000    blr x0
```

New:
```asm
0x0000000265a002bc  10ffffe0    adr x0, #-0x4 (addr 0x265a002b8)
0x0000000265a002c0  f9005f80    str x0, [x28, #184]
0x0000000265a002c4  d2800014    mov x20, #0x0
0x0000000265a002c8  d2800035    mov x21, #0x1
0x0000000265a002cc  d2800056    mov x22, #0x2
0x0000000265a002d0  394b0397    ldrb w23, [x28, #704]
0x0000000265a002d4  330002f6    bfxil w22, w23, #0, #1
0x0000000265a002d8  924002d6    and x22, x22, #0x1
0x0000000265a002dc  93400294    sbfx x20, x20, #0, #1
0x0000000265a002e0  934002b5    sbfx x21, x21, #0, #1
0x0000000265a002e4  f10002df    cmp x22, #0x0 (0)
0x0000000265a002e8  9a950294    csel x20, x20, x21, eq
0x0000000265a002ec  4e080e84    dup v4.2d, x20
0x0000000265a002f0  394baf94    ldrb w20, [x28, #747]
0x0000000265a002f4  91000695    add x21, x20, #0x1 (1)
0x0000000265a002f8  92400ab5    and x21, x21, #0x7
0x0000000265a002fc  d2800200    mov x0, #0x10
0x0000000265a00300  9b007e80    mul x0, x20, x0
0x0000000265a00304  8b000380    add x0, x28, x0
0x0000000265a00308  3dc0bc05    ldr q5, [x0, #752]
0x0000000265a0030c  d2800200    mov x0, #0x10
0x0000000265a00310  9b007ea0    mul x0, x21, x0
0x0000000265a00314  8b000380    add x0, x28, x0
0x0000000265a00318  3dc0bc06    ldr q6, [x0, #752]
0x0000000265a0031c  4ea41c80    mov v0.16b, v4.16b
0x0000000265a00320  6e651cc0    bsl v0.16b, v6.16b, v5.16b
0x0000000265a00324  4ea01c04    mov v4.16b, v0.16b
0x0000000265a00328  d2800200    mov x0, #0x10
0x0000000265a0032c  9b007e80    mul x0, x20, x0
0x0000000265a00330  8b000380    add x0, x28, x0
0x0000000265a00334  3d80bc04    str q4, [x0, #752]
0x0000000265a00338  58000040    ldr x0, pc+8 (addr 0x265a00340)
0x0000000265a0033c  d63f0000    blr x0
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 8, 2023
The previous implementation was particularly gnarly. Because these
instructions are both weakly ordered and have implementation-dependent
exception and trap behaviour, they can actually be fairly conveniently
converted over to a load + cmlt + bsl + str instruction sequence.

For the XMM variant this reduces code blowup from 80x to 15x!
For the MMX variant this reduces code blowup from 46x to 17x!

Both of these improvements are significant wins! There's still a minor
improvement that could be made around bsl, which currently requires some
redundant moves, but since we don't have register constraint support for
it we still eat two additional instructions.

Before:
```asm
0x0000ffff7b800718  10ffffe0    adr x0, #-0x4 (addr 0xffff7b800714)
0x0000ffff7b80071c  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000ffff7b800720  4eb11e24    mov v4.16b, v17.16b
0x0000ffff7b800724  4eb01e05    mov v5.16b, v16.16b
0x0000ffff7b800728  aa0b03f4    mov x20, x11
0x0000ffff7b80072c  4e083c95    mov x21, v4.d[0]
0x0000ffff7b800730  4e083cb6    mov x22, v5.d[0]
0x0000ffff7b800734  d3471eb7    ubfx x23, x21, FEX-Emu#7, FEX-Emu#1
0x0000ffff7b800738  b4000077    cbz x23, #+0xc (addr 0xffff7b800744)
0x0000ffff7b80073c  d3401ed7    uxtb x23, w22
0x0000ffff7b800740  39000297    strb w23, [x20]
0x0000ffff7b800744  d34f3eb7    ubfx x23, x21, FEX-Emu#15, FEX-Emu#1
0x0000ffff7b800748  b4000077    cbz x23, #+0xc (addr 0xffff7b800754)
0x0000ffff7b80074c  d3483ed7    ubfx x23, x22, FEX-Emu#8, FEX-Emu#8
0x0000ffff7b800750  39000697    strb w23, [x20, FEX-Emu#1]
0x0000ffff7b800754  d3575eb7    ubfx x23, x21, FEX-Emu#23, FEX-Emu#1
0x0000ffff7b800758  b4000077    cbz x23, #+0xc (addr 0xffff7b800764)
0x0000ffff7b80075c  d3505ed7    ubfx x23, x22, FEX-Emu#16, FEX-Emu#8
0x0000ffff7b800760  39000a97    strb w23, [x20, FEX-Emu#2]
0x0000ffff7b800764  d35f7eb7    ubfx x23, x21, FEX-Emu#31, FEX-Emu#1
0x0000ffff7b800768  b4000077    cbz x23, #+0xc (addr 0xffff7b800774)
0x0000ffff7b80076c  d3587ed7    ubfx x23, x22, FEX-Emu#24, FEX-Emu#8
0x0000ffff7b800770  39000e97    strb w23, [x20, FEX-Emu#3]
0x0000ffff7b800774  d3679eb7    ubfx x23, x21, FEX-Emu#39, FEX-Emu#1
0x0000ffff7b800778  b4000077    cbz x23, #+0xc (addr 0xffff7b800784)
0x0000ffff7b80077c  d3609ed7    ubfx x23, x22, FEX-Emu#32, FEX-Emu#8
0x0000ffff7b800780  39001297    strb w23, [x20, FEX-Emu#4]
0x0000ffff7b800784  d36fbeb7    ubfx x23, x21, FEX-Emu#47, FEX-Emu#1
0x0000ffff7b800788  b4000077    cbz x23, #+0xc (addr 0xffff7b800794)
0x0000ffff7b80078c  d368bed7    ubfx x23, x22, FEX-Emu#40, FEX-Emu#8
0x0000ffff7b800790  39001697    strb w23, [x20, FEX-Emu#5]
0x0000ffff7b800794  d377deb7    ubfx x23, x21, FEX-Emu#55, FEX-Emu#1
0x0000ffff7b800798  b4000077    cbz x23, #+0xc (addr 0xffff7b8007a4)
0x0000ffff7b80079c  d370ded7    ubfx x23, x22, FEX-Emu#48, FEX-Emu#8
0x0000ffff7b8007a0  39001a97    strb w23, [x20, FEX-Emu#6]
0x0000ffff7b8007a4  d37ffeb5    lsr x21, x21, FEX-Emu#63
0x0000ffff7b8007a8  b4000075    cbz x21, #+0xc (addr 0xffff7b8007b4)
0x0000ffff7b8007ac  d378fed5    lsr x21, x22, FEX-Emu#56
0x0000ffff7b8007b0  39001e95    strb w21, [x20, FEX-Emu#7]
0x0000ffff7b8007b4  4e183c95    mov x21, v4.d[1]
0x0000ffff7b8007b8  4e183cb6    mov x22, v5.d[1]
0x0000ffff7b8007bc  d3471eb7    ubfx x23, x21, FEX-Emu#7, FEX-Emu#1
0x0000ffff7b8007c0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007cc)
0x0000ffff7b8007c4  d3401ed7    uxtb x23, w22
0x0000ffff7b8007c8  39002297    strb w23, [x20, FEX-Emu#8]
0x0000ffff7b8007cc  d34f3eb7    ubfx x23, x21, FEX-Emu#15, FEX-Emu#1
0x0000ffff7b8007d0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007dc)
0x0000ffff7b8007d4  d3483ed7    ubfx x23, x22, FEX-Emu#8, FEX-Emu#8
0x0000ffff7b8007d8  39002697    strb w23, [x20, FEX-Emu#9]
0x0000ffff7b8007dc  d3575eb7    ubfx x23, x21, FEX-Emu#23, FEX-Emu#1
0x0000ffff7b8007e0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007ec)
0x0000ffff7b8007e4  d3505ed7    ubfx x23, x22, FEX-Emu#16, FEX-Emu#8
0x0000ffff7b8007e8  39002a97    strb w23, [x20, FEX-Emu#10]
0x0000ffff7b8007ec  d35f7eb7    ubfx x23, x21, FEX-Emu#31, FEX-Emu#1
0x0000ffff7b8007f0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007fc)
0x0000ffff7b8007f4  d3587ed7    ubfx x23, x22, FEX-Emu#24, FEX-Emu#8
0x0000ffff7b8007f8  39002e97    strb w23, [x20, FEX-Emu#11]
0x0000ffff7b8007fc  d3679eb7    ubfx x23, x21, FEX-Emu#39, FEX-Emu#1
0x0000ffff7b800800  b4000077    cbz x23, #+0xc (addr 0xffff7b80080c)
0x0000ffff7b800804  d3609ed7    ubfx x23, x22, FEX-Emu#32, FEX-Emu#8
0x0000ffff7b800808  39003297    strb w23, [x20, FEX-Emu#12]
0x0000ffff7b80080c  d36fbeb7    ubfx x23, x21, FEX-Emu#47, FEX-Emu#1
0x0000ffff7b800810  b4000077    cbz x23, #+0xc (addr 0xffff7b80081c)
0x0000ffff7b800814  d368bed7    ubfx x23, x22, FEX-Emu#40, FEX-Emu#8
0x0000ffff7b800818  39003697    strb w23, [x20, FEX-Emu#13]
0x0000ffff7b80081c  d377deb7    ubfx x23, x21, FEX-Emu#55, FEX-Emu#1
0x0000ffff7b800820  b4000077    cbz x23, #+0xc (addr 0xffff7b80082c)
0x0000ffff7b800824  d370ded7    ubfx x23, x22, FEX-Emu#48, FEX-Emu#8
0x0000ffff7b800828  39003a97    strb w23, [x20, FEX-Emu#14]
0x0000ffff7b80082c  d37ffeb5    lsr x21, x21, FEX-Emu#63
0x0000ffff7b800830  b4000075    cbz x21, #+0xc (addr 0xffff7b80083c)
0x0000ffff7b800834  d378fed5    lsr x21, x22, FEX-Emu#56
0x0000ffff7b800838  39003e95    strb w21, [x20, FEX-Emu#15]
0x0000ffff7b80083c  58000040    ldr x0, pc+8 (addr 0xffff7b800844)
0x0000ffff7b800840  d63f0000    blr x0
```

After:
```asm
0x0000ffff7ac00718  10ffffe0            adr x0, #-0x4 (addr 0xffff7ac00714)
0x0000ffff7ac0071c  f9005f80            str x0, [x28, #184]
0x0000ffff7ac00720  4e20aa24            cmlt v4.16b, v17.16b, #0
0x0000ffff7ac00724  3dc00165            ldr q5, [x11]
0x0000ffff7ac00728  4ea41c80            mov v0.16b, v4.16b
0x0000ffff7ac0072c  6e651e00            bsl v0.16b, v16.16b, v5.16b
0x0000ffff7ac00730  4ea01c04            mov v4.16b, v0.16b
0x0000ffff7ac00734  3d800164            str q4, [x11]
0x0000ffff7ac00738  58000040            ldr x0, pc+8 (addr 0xffff7ac00740)
0x0000ffff7ac0073c  d63f0000            blr x0
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 16, 2023
The BFI cascades in this particular instruction weren't optimal.
Biggest improvement is the 8-bit version, while the 16-bit version gets
a minor improvement.

8-bit instruction count reduced from 38 to 29.
16-bit instruction count reduced from 34 to 28.

RCL can have a similar optimization done to it.
```asm
Before 16-bit:
0x0000ffff80a801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff80a801dc)
0x0000ffff80a801e4  f9005f80    str x0, [x28, #184]
0x0000ffff80a801e8  d3403cb4    uxth x20, w5
0x0000ffff80a801ec  d3403cf5    uxth x21, w7
0x0000ffff80a801f0  394b0396    ldrb w22, [x28, #704]
0x0000ffff80a801f4  12001294    and w20, w20, #0x1f
0x0000ffff80a801f8  d2800017    mov x23, #0x0
0x0000ffff80a801fc  b3403eb7    bfxil x23, x21, #0, #16
0x0000ffff80a80200  b37002d7    bfi x23, x22, #16, #1
0x0000ffff80a80204  b36f3eb7    bfi x23, x21, #17, #16
0x0000ffff80a80208  b35f02d7    bfi x23, x22, #33, #1
0x0000ffff80a8020c  aa1703e0    mov x0, x23
0x0000ffff80a80210  b35e3ea0    bfi x0, x21, #34, #16
0x0000ffff80a80214  aa0003f5    mov x21, x0
0x0000ffff80a80218  b34e02d5    bfi x21, x22, #50, #1
0x0000ffff80a8021c  9ad426b7    lsr x23, x21, x20
0x0000ffff80a80220  b3403ee7    bfxil x7, x23, #0, #16
0x0000ffff80a80224  51000698    sub w24, w20, #0x1 (1)
0x0000ffff80a80228  9ad826b5    lsr x21, x21, x24
0x0000ffff80a8022c  d34002b5    ubfx x21, x21, #0, #1
0x0000ffff80a80230  7100069f    cmp w20, #0x1 (1)
0x0000ffff80a80234  9a9622b4    csel x20, x21, x22, hs
0x0000ffff80a80238  390b0394    strb w20, [x28, #704]
0x0000ffff80a8023c  d34f3ef4    ubfx x20, x23, #15, #1
0x0000ffff80a80240  d34e3af5    ubfx x21, x23, #14, #1
0x0000ffff80a80244  ca150294    eor x20, x20, x21
0x0000ffff80a80248  390b2f94    strb w20, [x28, #715]
0x0000ffff80a8024c  58000040    ldr x0, pc+8 (addr 0xffff80a80254)
0x0000ffff80a80250  d63f0000    blr x0
0x0000ffff80a80254  967da128    bl #-0x6097b60 (addr 0xffff7a9e86f4)
0x0000ffff80a80258  0000ffff    udf #0xffff
0x0000ffff80a8025c  00010023    unallocated (Unallocated)
0x0000ffff80a80260  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 34
[DEBUG] Blow-up Amt: 34x

After 16-bit:
0x0000ffffa7c801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa7c801dc)
0x0000ffffa7c801e4  f9005f80            str x0, [x28, #184]
0x0000ffffa7c801e8  d3403cb4            uxth x20, w5
0x0000ffffa7c801ec  d3403cf5            uxth x21, w7
0x0000ffffa7c801f0  394b0396            ldrb w22, [x28, #704]
0x0000ffffa7c801f4  12001294            and w20, w20, #0x1f
0x0000ffffa7c801f8  b37002d5            bfi x21, x22, #16, #1
0x0000ffffa7c801fc  b36f42b5            bfi x21, x21, #17, #17
0x0000ffffa7c80200  b35e42b5            bfi x21, x21, #34, #17
0x0000ffffa7c80204  9ad426b7            lsr x23, x21, x20
0x0000ffffa7c80208  b3403ee7            bfxil x7, x23, #0, #16
0x0000ffffa7c8020c  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa7c80210  9ad826b5            lsr x21, x21, x24
0x0000ffffa7c80214  d34002b5            ubfx x21, x21, #0, #1
0x0000ffffa7c80218  7100069f            cmp w20, #0x1 (1)
0x0000ffffa7c8021c  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa7c80220  390b0394            strb w20, [x28, #704]
0x0000ffffa7c80224  d34f3ef4            ubfx x20, x23, #15, #1
0x0000ffffa7c80228  d34e3af5            ubfx x21, x23, #14, #1
0x0000ffffa7c8022c  ca150294            eor x20, x20, x21
0x0000ffffa7c80230  390b2f94            strb w20, [x28, #715]
0x0000ffffa7c80234  58000040            ldr x0, pc+8 (addr 0xffffa7c8023c)
0x0000ffffa7c80238  d63f0000            blr x0
0x0000ffffa7c8023c  bd9cc128            unallocated (Unallocated)
0x0000ffffa7c80240  0000ffff            udf #0xffff
0x0000ffffa7c80244  00010023            unallocated (Unallocated)
0x0000ffffa7c80248  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 28
[DEBUG] Blow-up Amt: 28x

Before 8-bit:
0x0000ffffa92801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa92801dc)
0x0000ffffa92801e4  f9005f80            str x0, [x28, #184]
0x0000ffffa92801e8  d3401cb4            uxtb x20, w5
0x0000ffffa92801ec  d3401cf5            uxtb x21, w7
0x0000ffffa92801f0  394b0396            ldrb w22, [x28, #704]
0x0000ffffa92801f4  12001294            and w20, w20, #0x1f
0x0000ffffa92801f8  d2800017            mov x23, #0x0
0x0000ffffa92801fc  b3401eb7            bfxil x23, x21, #0, #8
0x0000ffffa9280200  b37802d7            bfi x23, x22, #8, #1
0x0000ffffa9280204  b3771eb7            bfi x23, x21, #9, #8
0x0000ffffa9280208  b36f02d7            bfi x23, x22, #17, #1
0x0000ffffa928020c  b36e1eb7            bfi x23, x21, #18, #8
0x0000ffffa9280210  b36602d7            bfi x23, x22, #26, #1
0x0000ffffa9280214  b3651eb7            bfi x23, x21, #27, #8
0x0000ffffa9280218  b35d02d7            bfi x23, x22, #35, #1
0x0000ffffa928021c  aa1703e0            mov x0, x23
0x0000ffffa9280220  b35c1ea0            bfi x0, x21, #36, #8
0x0000ffffa9280224  aa0003f5            mov x21, x0
0x0000ffffa9280228  b35402d5            bfi x21, x22, #44, #1
0x0000ffffa928022c  9ad426b7            lsr x23, x21, x20
0x0000ffffa9280230  b3401ee7            bfxil x7, x23, #0, #8
0x0000ffffa9280234  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa9280238  9ad826b5            lsr x21, x21, x24
0x0000ffffa928023c  d34002b5            ubfx x21, x21, #0, #1
0x0000ffffa9280240  7100069f            cmp w20, #0x1 (1)
0x0000ffffa9280244  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa9280248  390b0394            strb w20, [x28, #704]
0x0000ffffa928024c  d3471ef4            ubfx x20, x23, #7, #1
0x0000ffffa9280250  d3461af5            ubfx x21, x23, #6, #1
0x0000ffffa9280254  ca150294            eor x20, x20, x21
0x0000ffffa9280258  390b2f94            strb w20, [x28, #715]
0x0000ffffa928025c  58000040            ldr x0, pc+8 (addr 0xffffa9280264)
0x0000ffffa9280260  d63f0000            blr x0
0x0000ffffa9280264  bf062128            unallocated (Unallocated)
0x0000ffffa9280268  0000ffff            udf #0xffff
0x0000ffffa928026c  00010022            unallocated (Unallocated)
0x0000ffffa9280270  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 38
[DEBUG] Blow-up Amt: 38x

After 8-bit:
0x0000ffff9cc801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff9cc801dc)
0x0000ffff9cc801e4  f9005f80    str x0, [x28, #184]
0x0000ffff9cc801e8  d3401cb4    uxtb x20, w5
0x0000ffff9cc801ec  d3401cf5    uxtb x21, w7
0x0000ffff9cc801f0  394b0396    ldrb w22, [x28, #704]
0x0000ffff9cc801f4  12001294    and w20, w20, #0x1f
0x0000ffff9cc801f8  b37802d5    bfi x21, x22, #8, #1
0x0000ffff9cc801fc  b37722b5    bfi x21, x21, #9, #9
0x0000ffff9cc80200  b36e46b5    bfi x21, x21, #18, #18
0x0000ffff9cc80204  b3778eb5    bfi x21, x21, #9, #36
0x0000ffff9cc80208  9ad426b7    lsr x23, x21, x20
0x0000ffff9cc8020c  b3401ee7    bfxil x7, x23, #0, #8
0x0000ffff9cc80210  51000698    sub w24, w20, #0x1 (1)
0x0000ffff9cc80214  9ad826b5    lsr x21, x21, x24
0x0000ffff9cc80218  d34002b5    ubfx x21, x21, #0, #1
0x0000ffff9cc8021c  7100069f    cmp w20, #0x1 (1)
0x0000ffff9cc80220  9a9622b4    csel x20, x21, x22, hs
0x0000ffff9cc80224  390b0394    strb w20, [x28, #704]
0x0000ffff9cc80228  d3471ef4    ubfx x20, x23, #7, #1
0x0000ffff9cc8022c  d3461af5    ubfx x21, x23, #6, #1
0x0000ffff9cc80230  ca150294    eor x20, x20, x21
0x0000ffff9cc80234  390b2f94    strb w20, [x28, #715]
0x0000ffff9cc80238  58000040    ldr x0, pc+8 (addr 0xffff9cc80240)
0x0000ffff9cc8023c  d63f0000    blr x0
0x0000ffff9cc80240  b2a75128    unallocated (Unallocated)
0x0000ffff9cc80244  0000ffff    udf #0xffff
0x0000ffff9cc80248  00010022    unallocated (Unallocated)
0x0000ffff9cc8024c  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 29
[DEBUG] Blow-up Amt: 29x
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 18, 2023
The BFI cascades in this particular instruction weren't optimal.
Biggest improvement is the 8-bit version, while the 16-bit version gets
a minor improvement.

8-bit instruction count reduced from 38 to 29.
16-bit instruction count reduced from 34 to 28.

RCL can have a similar optimization done to it.
```asm
Before 16-bit:
0x0000ffff80a801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff80a801dc)
0x0000ffff80a801e4  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000ffff80a801e8  d3403cb4    uxth x20, w5
0x0000ffff80a801ec  d3403cf5    uxth x21, w7
0x0000ffff80a801f0  394b0396    ldrb w22, [x28, FEX-Emu#704]
0x0000ffff80a801f4  12001294    and w20, w20, #0x1f
0x0000ffff80a801f8  d2800017    mov x23, #0x0
0x0000ffff80a801fc  b3403eb7    bfxil x23, x21, #0, FEX-Emu#16
0x0000ffff80a80200  b37002d7    bfi x23, x22, FEX-Emu#16, FEX-Emu#1
0x0000ffff80a80204  b36f3eb7    bfi x23, x21, FEX-Emu#17, FEX-Emu#16
0x0000ffff80a80208  b35f02d7    bfi x23, x22, FEX-Emu#33, FEX-Emu#1
0x0000ffff80a8020c  aa1703e0    mov x0, x23
0x0000ffff80a80210  b35e3ea0    bfi x0, x21, FEX-Emu#34, FEX-Emu#16
0x0000ffff80a80214  aa0003f5    mov x21, x0
0x0000ffff80a80218  b34e02d5    bfi x21, x22, FEX-Emu#50, FEX-Emu#1
0x0000ffff80a8021c  9ad426b7    lsr x23, x21, x20
0x0000ffff80a80220  b3403ee7    bfxil x7, x23, #0, FEX-Emu#16
0x0000ffff80a80224  51000698    sub w24, w20, #0x1 (1)
0x0000ffff80a80228  9ad826b5    lsr x21, x21, x24
0x0000ffff80a8022c  d34002b5    ubfx x21, x21, #0, FEX-Emu#1
0x0000ffff80a80230  7100069f    cmp w20, #0x1 (1)
0x0000ffff80a80234  9a9622b4    csel x20, x21, x22, hs
0x0000ffff80a80238  390b0394    strb w20, [x28, FEX-Emu#704]
0x0000ffff80a8023c  d34f3ef4    ubfx x20, x23, FEX-Emu#15, FEX-Emu#1
0x0000ffff80a80240  d34e3af5    ubfx x21, x23, FEX-Emu#14, FEX-Emu#1
0x0000ffff80a80244  ca150294    eor x20, x20, x21
0x0000ffff80a80248  390b2f94    strb w20, [x28, FEX-Emu#715]
0x0000ffff80a8024c  58000040    ldr x0, pc+8 (addr 0xffff80a80254)
0x0000ffff80a80250  d63f0000    blr x0
0x0000ffff80a80254  967da128    bl #-0x6097b60 (addr 0xffff7a9e86f4)
0x0000ffff80a80258  0000ffff    udf #0xffff
0x0000ffff80a8025c  00010023    unallocated (Unallocated)
0x0000ffff80a80260  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 34
[DEBUG] Blow-up Amt: 34x

After 16-bit:
0x0000ffffa7c801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa7c801dc)
0x0000ffffa7c801e4  f9005f80            str x0, [x28, FEX-Emu#184]
0x0000ffffa7c801e8  d3403cb4            uxth x20, w5
0x0000ffffa7c801ec  d3403cf5            uxth x21, w7
0x0000ffffa7c801f0  394b0396            ldrb w22, [x28, FEX-Emu#704]
0x0000ffffa7c801f4  12001294            and w20, w20, #0x1f
0x0000ffffa7c801f8  b37002d5            bfi x21, x22, FEX-Emu#16, FEX-Emu#1
0x0000ffffa7c801fc  b36f42b5            bfi x21, x21, FEX-Emu#17, FEX-Emu#17
0x0000ffffa7c80200  b35e42b5            bfi x21, x21, FEX-Emu#34, FEX-Emu#17
0x0000ffffa7c80204  9ad426b7            lsr x23, x21, x20
0x0000ffffa7c80208  b3403ee7            bfxil x7, x23, #0, FEX-Emu#16
0x0000ffffa7c8020c  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa7c80210  9ad826b5            lsr x21, x21, x24
0x0000ffffa7c80214  d34002b5            ubfx x21, x21, #0, FEX-Emu#1
0x0000ffffa7c80218  7100069f            cmp w20, #0x1 (1)
0x0000ffffa7c8021c  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa7c80220  390b0394            strb w20, [x28, FEX-Emu#704]
0x0000ffffa7c80224  d34f3ef4            ubfx x20, x23, FEX-Emu#15, FEX-Emu#1
0x0000ffffa7c80228  d34e3af5            ubfx x21, x23, FEX-Emu#14, FEX-Emu#1
0x0000ffffa7c8022c  ca150294            eor x20, x20, x21
0x0000ffffa7c80230  390b2f94            strb w20, [x28, FEX-Emu#715]
0x0000ffffa7c80234  58000040            ldr x0, pc+8 (addr 0xffffa7c8023c)
0x0000ffffa7c80238  d63f0000            blr x0
0x0000ffffa7c8023c  bd9cc128            unallocated (Unallocated)
0x0000ffffa7c80240  0000ffff            udf #0xffff
0x0000ffffa7c80244  00010023            unallocated (Unallocated)
0x0000ffffa7c80248  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 28
[DEBUG] Blow-up Amt: 28x

Before 8-bit:
0x0000ffffa92801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa92801dc)
0x0000ffffa92801e4  f9005f80            str x0, [x28, FEX-Emu#184]
0x0000ffffa92801e8  d3401cb4            uxtb x20, w5
0x0000ffffa92801ec  d3401cf5            uxtb x21, w7
0x0000ffffa92801f0  394b0396            ldrb w22, [x28, FEX-Emu#704]
0x0000ffffa92801f4  12001294            and w20, w20, #0x1f
0x0000ffffa92801f8  d2800017            mov x23, #0x0
0x0000ffffa92801fc  b3401eb7            bfxil x23, x21, #0, FEX-Emu#8
0x0000ffffa9280200  b37802d7            bfi x23, x22, FEX-Emu#8, FEX-Emu#1
0x0000ffffa9280204  b3771eb7            bfi x23, x21, FEX-Emu#9, FEX-Emu#8
0x0000ffffa9280208  b36f02d7            bfi x23, x22, FEX-Emu#17, FEX-Emu#1
0x0000ffffa928020c  b36e1eb7            bfi x23, x21, FEX-Emu#18, FEX-Emu#8
0x0000ffffa9280210  b36602d7            bfi x23, x22, FEX-Emu#26, FEX-Emu#1
0x0000ffffa9280214  b3651eb7            bfi x23, x21, FEX-Emu#27, FEX-Emu#8
0x0000ffffa9280218  b35d02d7            bfi x23, x22, FEX-Emu#35, FEX-Emu#1
0x0000ffffa928021c  aa1703e0            mov x0, x23
0x0000ffffa9280220  b35c1ea0            bfi x0, x21, FEX-Emu#36, FEX-Emu#8
0x0000ffffa9280224  aa0003f5            mov x21, x0
0x0000ffffa9280228  b35402d5            bfi x21, x22, FEX-Emu#44, FEX-Emu#1
0x0000ffffa928022c  9ad426b7            lsr x23, x21, x20
0x0000ffffa9280230  b3401ee7            bfxil x7, x23, #0, FEX-Emu#8
0x0000ffffa9280234  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa9280238  9ad826b5            lsr x21, x21, x24
0x0000ffffa928023c  d34002b5            ubfx x21, x21, #0, FEX-Emu#1
0x0000ffffa9280240  7100069f            cmp w20, #0x1 (1)
0x0000ffffa9280244  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa9280248  390b0394            strb w20, [x28, FEX-Emu#704]
0x0000ffffa928024c  d3471ef4            ubfx x20, x23, FEX-Emu#7, FEX-Emu#1
0x0000ffffa9280250  d3461af5            ubfx x21, x23, FEX-Emu#6, FEX-Emu#1
0x0000ffffa9280254  ca150294            eor x20, x20, x21
0x0000ffffa9280258  390b2f94            strb w20, [x28, FEX-Emu#715]
0x0000ffffa928025c  58000040            ldr x0, pc+8 (addr 0xffffa9280264)
0x0000ffffa9280260  d63f0000            blr x0
0x0000ffffa9280264  bf062128            unallocated (Unallocated)
0x0000ffffa9280268  0000ffff            udf #0xffff
0x0000ffffa928026c  00010022            unallocated (Unallocated)
0x0000ffffa9280270  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 38
[DEBUG] Blow-up Amt: 38x

After 8-bit:
0x0000ffff9cc801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff9cc801dc)
0x0000ffff9cc801e4  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000ffff9cc801e8  d3401cb4    uxtb x20, w5
0x0000ffff9cc801ec  d3401cf5    uxtb x21, w7
0x0000ffff9cc801f0  394b0396    ldrb w22, [x28, FEX-Emu#704]
0x0000ffff9cc801f4  12001294    and w20, w20, #0x1f
0x0000ffff9cc801f8  b37802d5    bfi x21, x22, FEX-Emu#8, FEX-Emu#1
0x0000ffff9cc801fc  b37722b5    bfi x21, x21, FEX-Emu#9, FEX-Emu#9
0x0000ffff9cc80200  b36e46b5    bfi x21, x21, FEX-Emu#18, FEX-Emu#18
0x0000ffff9cc80204  b3778eb5    bfi x21, x21, FEX-Emu#9, FEX-Emu#36
0x0000ffff9cc80208  9ad426b7    lsr x23, x21, x20
0x0000ffff9cc8020c  b3401ee7    bfxil x7, x23, #0, FEX-Emu#8
0x0000ffff9cc80210  51000698    sub w24, w20, #0x1 (1)
0x0000ffff9cc80214  9ad826b5    lsr x21, x21, x24
0x0000ffff9cc80218  d34002b5    ubfx x21, x21, #0, FEX-Emu#1
0x0000ffff9cc8021c  7100069f    cmp w20, #0x1 (1)
0x0000ffff9cc80220  9a9622b4    csel x20, x21, x22, hs
0x0000ffff9cc80224  390b0394    strb w20, [x28, FEX-Emu#704]
0x0000ffff9cc80228  d3471ef4    ubfx x20, x23, FEX-Emu#7, FEX-Emu#1
0x0000ffff9cc8022c  d3461af5    ubfx x21, x23, FEX-Emu#6, FEX-Emu#1
0x0000ffff9cc80230  ca150294    eor x20, x20, x21
0x0000ffff9cc80234  390b2f94    strb w20, [x28, FEX-Emu#715]
0x0000ffff9cc80238  58000040    ldr x0, pc+8 (addr 0xffff9cc80240)
0x0000ffff9cc8023c  d63f0000    blr x0
0x0000ffff9cc80240  b2a75128    unallocated (Unallocated)
0x0000ffff9cc80244  0000ffff    udf #0xffff
0x0000ffff9cc80248  00010022    unallocated (Unallocated)
0x0000ffff9cc8024c  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 29
[DEBUG] Blow-up Amt: 29x
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, #32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, #16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 41,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, #32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov w0, v17.s[0]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[1]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[2]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[3]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov w0, v2.s[0]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[1]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[2]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[3]",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, #16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 41,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov w0, v17.s[0]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[1]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[2]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v17.s[3]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov w0, v2.s[0]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[1]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[2]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov w0, v2.s[3]",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, #32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, #63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, #63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, #63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, #63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, #31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, #16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's still a small improvement available: the ASIMD implementation
doesn't actually need to touch nzcv, but I'll leave that for a future
change.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants