-
Notifications
You must be signed in to change notification settings - Fork 128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes CMake configuration on AArch64 host #16
Merged
Sonicadvance1
merged 1 commit into
FEX-Emu:master
from
Sonicadvance1:fixes_cmake_aarch64
Mar 13, 2020
Merged
Fixes CMake configuration on AArch64 host #16
Sonicadvance1
merged 1 commit into
FEX-Emu:master
from
Sonicadvance1:fixes_cmake_aarch64
Mar 13, 2020
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
phire
approved these changes
Mar 13, 2020
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 7, 2023
Only return the particular flags that are being requested in the moment since compacting them all when requested is fairly slow. x87 fcmov in particular was requesting all the flags when it only needs a couple. This reduces a `fcmovb` instruction count blowup from 103x to 48x. Still more room to go but this one stood out as being particularly bad. Old: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b0397 ldrb w23, [x28, FEX-Emu#704] 0x0000000265a002d4 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002d8 aa1702d6 orr x22, x22, x23 0x0000000265a002dc 394b0b97 ldrb w23, [x28, FEX-Emu#706] 0x0000000265a002e0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002e4 531e76f7 lsl w23, w23, FEX-Emu#2 0x0000000265a002e8 aa1702d6 orr x22, x22, x23 0x0000000265a002ec 394b1397 ldrb w23, [x28, FEX-Emu#708] 0x0000000265a002f0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002f4 531c6ef7 lsl w23, w23, FEX-Emu#4 0x0000000265a002f8 aa1702d6 orr x22, x22, x23 0x0000000265a002fc 394b1b97 ldrb w23, [x28, FEX-Emu#710] 0x0000000265a00300 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00304 531a66f7 lsl w23, w23, FEX-Emu#6 0x0000000265a00308 aa1702d6 orr x22, x22, x23 0x0000000265a0030c 394b1f97 ldrb w23, [x28, FEX-Emu#711] 0x0000000265a00310 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00314 531962f7 lsl w23, w23, FEX-Emu#7 0x0000000265a00318 aa1702d6 orr x22, x22, x23 0x0000000265a0031c 394b2397 ldrb w23, [x28, FEX-Emu#712] 0x0000000265a00320 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00324 53185ef7 lsl w23, w23, FEX-Emu#8 0x0000000265a00328 aa1702d6 orr x22, x22, x23 0x0000000265a0032c 394b2797 ldrb w23, [x28, FEX-Emu#713] 0x0000000265a00330 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00334 53175af7 lsl w23, w23, FEX-Emu#9 
0x0000000265a00338 aa1702d6 orr x22, x22, x23 0x0000000265a0033c 394b2b97 ldrb w23, [x28, FEX-Emu#714] 0x0000000265a00340 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00344 531656f7 lsl w23, w23, FEX-Emu#10 0x0000000265a00348 aa1702d6 orr x22, x22, x23 0x0000000265a0034c 394b2f97 ldrb w23, [x28, FEX-Emu#715] 0x0000000265a00350 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00354 531552f7 lsl w23, w23, FEX-Emu#11 0x0000000265a00358 aa1702d6 orr x22, x22, x23 0x0000000265a0035c 394b3397 ldrb w23, [x28, FEX-Emu#716] 0x0000000265a00360 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00364 53144ef7 lsl w23, w23, FEX-Emu#12 0x0000000265a00368 aa1702d6 orr x22, x22, x23 0x0000000265a0036c 394b3b97 ldrb w23, [x28, FEX-Emu#718] 0x0000000265a00370 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00374 531246f7 lsl w23, w23, FEX-Emu#14 0x0000000265a00378 aa1702d6 orr x22, x22, x23 0x0000000265a0037c 394b4397 ldrb w23, [x28, FEX-Emu#720] 0x0000000265a00380 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00384 53103ef7 lsl w23, w23, FEX-Emu#16 0x0000000265a00388 aa1702d6 orr x22, x22, x23 0x0000000265a0038c 394b4797 ldrb w23, [x28, FEX-Emu#721] 0x0000000265a00390 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00394 530f3af7 lsl w23, w23, FEX-Emu#17 0x0000000265a00398 aa1702d6 orr x22, x22, x23 0x0000000265a0039c 394b4b97 ldrb w23, [x28, FEX-Emu#722] 0x0000000265a003a0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003a4 530e36f7 lsl w23, w23, FEX-Emu#18 0x0000000265a003a8 aa1702d6 orr x22, x22, x23 0x0000000265a003ac 394b4f97 ldrb w23, [x28, FEX-Emu#723] 0x0000000265a003b0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003b4 530d32f7 lsl w23, w23, FEX-Emu#19 0x0000000265a003b8 aa1702d6 orr x22, x22, x23 0x0000000265a003bc 394b5397 ldrb w23, [x28, FEX-Emu#724] 0x0000000265a003c0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003c4 530c2ef7 lsl w23, w23, FEX-Emu#20 0x0000000265a003c8 aa1702d6 orr x22, x22, x23 0x0000000265a003cc 394b5797 
ldrb w23, [x28, FEX-Emu#725] 0x0000000265a003d0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003d4 530b2af7 lsl w23, w23, FEX-Emu#21 0x0000000265a003d8 aa1702d6 orr x22, x22, x23 0x0000000265a003dc 924002d6 and x22, x22, #0x1 0x0000000265a003e0 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a003e4 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a003e8 f10002df cmp x22, #0x0 (0) 0x0000000265a003ec 9a950294 csel x20, x20, x21, eq 0x0000000265a003f0 4e080e84 dup v4.2d, x20 0x0000000265a003f4 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a003f8 91000695 add x21, x20, #0x1 (1) 0x0000000265a003fc 92400ab5 and x21, x21, #0x7 0x0000000265a00400 d2800200 mov x0, #0x10 0x0000000265a00404 9b007e80 mul x0, x20, x0 0x0000000265a00408 8b000380 add x0, x28, x0 0x0000000265a0040c 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a00410 d2800200 mov x0, #0x10 0x0000000265a00414 9b007ea0 mul x0, x21, x0 0x0000000265a00418 8b000380 add x0, x28, x0 0x0000000265a0041c 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a00420 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00424 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a00428 4ea01c04 mov v4.16b, v0.16b 0x0000000265a0042c d2800200 mov x0, #0x10 0x0000000265a00430 9b007e80 mul x0, x20, x0 0x0000000265a00434 8b000380 add x0, x28, x0 0x0000000265a00438 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a0043c 58000040 ldr x0, pc+8 (addr 0x265a00444) 0x0000000265a00440 d63f0000 blr x0 ``` New: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b1f97 ldrb w23, [x28, FEX-Emu#711] 0x0000000265a002d4 331902f6 bfi w22, w23, FEX-Emu#7, FEX-Emu#1 0x0000000265a002d8 394b2797 ldrb w23, [x28, FEX-Emu#713] 0x0000000265a002dc 331702f6 bfi w22, w23, FEX-Emu#9, FEX-Emu#1 0x0000000265a002e0 394b2f97 ldrb w23, [x28, FEX-Emu#715] 
0x0000000265a002e4 331502f6 bfi w22, w23, FEX-Emu#11, FEX-Emu#1 0x0000000265a002e8 394b4797 ldrb w23, [x28, FEX-Emu#721] 0x0000000265a002ec 330f02f6 bfi w22, w23, FEX-Emu#17, FEX-Emu#1 0x0000000265a002f0 394b4f97 ldrb w23, [x28, FEX-Emu#723] 0x0000000265a002f4 330d02f6 bfi w22, w23, FEX-Emu#19, FEX-Emu#1 0x0000000265a002f8 394b5797 ldrb w23, [x28, FEX-Emu#725] 0x0000000265a002fc 330b02f6 bfi w22, w23, FEX-Emu#21, FEX-Emu#1 0x0000000265a00300 924002d6 and x22, x22, #0x1 0x0000000265a00304 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a00308 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a0030c f10002df cmp x22, #0x0 (0) 0x0000000265a00310 9a950294 csel x20, x20, x21, eq 0x0000000265a00314 4e080e84 dup v4.2d, x20 0x0000000265a00318 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a0031c 91000695 add x21, x20, #0x1 (1) 0x0000000265a00320 92400ab5 and x21, x21, #0x7 0x0000000265a00324 d2800200 mov x0, #0x10 0x0000000265a00328 9b007e80 mul x0, x20, x0 0x0000000265a0032c 8b000380 add x0, x28, x0 0x0000000265a00330 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a00334 d2800200 mov x0, #0x10 0x0000000265a00338 9b007ea0 mul x0, x21, x0 0x0000000265a0033c 8b000380 add x0, x28, x0 0x0000000265a00340 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a00344 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00348 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a0034c 4ea01c04 mov v4.16b, v0.16b 0x0000000265a00350 d2800200 mov x0, #0x10 0x0000000265a00354 9b007e80 mul x0, x20, x0 0x0000000265a00358 8b000380 add x0, x28, x0 0x0000000265a0035c 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a00360 58000040 ldr x0, pc+8 (addr 0x265a00368) 0x0000000265a00364 d63f0000 blr x0 ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 7, 2023
Only return the particular flags that are being requested in the moment since compacting them all when requested is fairly slow. x87 fcmov in particular was requesting all the flags when it only needs a couple. This reduces a `fcmovb` instruction count blowup from 103x to 38x. Still more room to go but this one stood out as being particularly bad. Old: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b0397 ldrb w23, [x28, FEX-Emu#704] 0x0000000265a002d4 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002d8 aa1702d6 orr x22, x22, x23 0x0000000265a002dc 394b0b97 ldrb w23, [x28, FEX-Emu#706] 0x0000000265a002e0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002e4 531e76f7 lsl w23, w23, FEX-Emu#2 0x0000000265a002e8 aa1702d6 orr x22, x22, x23 0x0000000265a002ec 394b1397 ldrb w23, [x28, FEX-Emu#708] 0x0000000265a002f0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002f4 531c6ef7 lsl w23, w23, FEX-Emu#4 0x0000000265a002f8 aa1702d6 orr x22, x22, x23 0x0000000265a002fc 394b1b97 ldrb w23, [x28, FEX-Emu#710] 0x0000000265a00300 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00304 531a66f7 lsl w23, w23, FEX-Emu#6 0x0000000265a00308 aa1702d6 orr x22, x22, x23 0x0000000265a0030c 394b1f97 ldrb w23, [x28, FEX-Emu#711] 0x0000000265a00310 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00314 531962f7 lsl w23, w23, FEX-Emu#7 0x0000000265a00318 aa1702d6 orr x22, x22, x23 0x0000000265a0031c 394b2397 ldrb w23, [x28, FEX-Emu#712] 0x0000000265a00320 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00324 53185ef7 lsl w23, w23, FEX-Emu#8 0x0000000265a00328 aa1702d6 orr x22, x22, x23 0x0000000265a0032c 394b2797 ldrb w23, [x28, FEX-Emu#713] 0x0000000265a00330 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00334 53175af7 lsl w23, w23, FEX-Emu#9 
0x0000000265a00338 aa1702d6 orr x22, x22, x23 0x0000000265a0033c 394b2b97 ldrb w23, [x28, FEX-Emu#714] 0x0000000265a00340 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00344 531656f7 lsl w23, w23, FEX-Emu#10 0x0000000265a00348 aa1702d6 orr x22, x22, x23 0x0000000265a0034c 394b2f97 ldrb w23, [x28, FEX-Emu#715] 0x0000000265a00350 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00354 531552f7 lsl w23, w23, FEX-Emu#11 0x0000000265a00358 aa1702d6 orr x22, x22, x23 0x0000000265a0035c 394b3397 ldrb w23, [x28, FEX-Emu#716] 0x0000000265a00360 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00364 53144ef7 lsl w23, w23, FEX-Emu#12 0x0000000265a00368 aa1702d6 orr x22, x22, x23 0x0000000265a0036c 394b3b97 ldrb w23, [x28, FEX-Emu#718] 0x0000000265a00370 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00374 531246f7 lsl w23, w23, FEX-Emu#14 0x0000000265a00378 aa1702d6 orr x22, x22, x23 0x0000000265a0037c 394b4397 ldrb w23, [x28, FEX-Emu#720] 0x0000000265a00380 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00384 53103ef7 lsl w23, w23, FEX-Emu#16 0x0000000265a00388 aa1702d6 orr x22, x22, x23 0x0000000265a0038c 394b4797 ldrb w23, [x28, FEX-Emu#721] 0x0000000265a00390 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00394 530f3af7 lsl w23, w23, FEX-Emu#17 0x0000000265a00398 aa1702d6 orr x22, x22, x23 0x0000000265a0039c 394b4b97 ldrb w23, [x28, FEX-Emu#722] 0x0000000265a003a0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003a4 530e36f7 lsl w23, w23, FEX-Emu#18 0x0000000265a003a8 aa1702d6 orr x22, x22, x23 0x0000000265a003ac 394b4f97 ldrb w23, [x28, FEX-Emu#723] 0x0000000265a003b0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003b4 530d32f7 lsl w23, w23, FEX-Emu#19 0x0000000265a003b8 aa1702d6 orr x22, x22, x23 0x0000000265a003bc 394b5397 ldrb w23, [x28, FEX-Emu#724] 0x0000000265a003c0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003c4 530c2ef7 lsl w23, w23, FEX-Emu#20 0x0000000265a003c8 aa1702d6 orr x22, x22, x23 0x0000000265a003cc 394b5797 
ldrb w23, [x28, FEX-Emu#725] 0x0000000265a003d0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003d4 530b2af7 lsl w23, w23, FEX-Emu#21 0x0000000265a003d8 aa1702d6 orr x22, x22, x23 0x0000000265a003dc 924002d6 and x22, x22, #0x1 0x0000000265a003e0 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a003e4 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a003e8 f10002df cmp x22, #0x0 (0) 0x0000000265a003ec 9a950294 csel x20, x20, x21, eq 0x0000000265a003f0 4e080e84 dup v4.2d, x20 0x0000000265a003f4 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a003f8 91000695 add x21, x20, #0x1 (1) 0x0000000265a003fc 92400ab5 and x21, x21, #0x7 0x0000000265a00400 d2800200 mov x0, #0x10 0x0000000265a00404 9b007e80 mul x0, x20, x0 0x0000000265a00408 8b000380 add x0, x28, x0 0x0000000265a0040c 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a00410 d2800200 mov x0, #0x10 0x0000000265a00414 9b007ea0 mul x0, x21, x0 0x0000000265a00418 8b000380 add x0, x28, x0 0x0000000265a0041c 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a00420 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00424 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a00428 4ea01c04 mov v4.16b, v0.16b 0x0000000265a0042c d2800200 mov x0, #0x10 0x0000000265a00430 9b007e80 mul x0, x20, x0 0x0000000265a00434 8b000380 add x0, x28, x0 0x0000000265a00438 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a0043c 58000040 ldr x0, pc+8 (addr 0x265a00444) 0x0000000265a00440 d63f0000 blr x0 ``` New: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b0397 ldrb w23, [x28, FEX-Emu#704] 0x0000000265a002d4 330002f6 bfxil w22, w23, #0, FEX-Emu#1 0x0000000265a002d8 924002d6 and x22, x22, #0x1 0x0000000265a002dc 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a002e0 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a002e4 f10002df 
cmp x22, #0x0 (0) 0x0000000265a002e8 9a950294 csel x20, x20, x21, eq 0x0000000265a002ec 4e080e84 dup v4.2d, x20 0x0000000265a002f0 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a002f4 91000695 add x21, x20, #0x1 (1) 0x0000000265a002f8 92400ab5 and x21, x21, #0x7 0x0000000265a002fc d2800200 mov x0, #0x10 0x0000000265a00300 9b007e80 mul x0, x20, x0 0x0000000265a00304 8b000380 add x0, x28, x0 0x0000000265a00308 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a0030c d2800200 mov x0, #0x10 0x0000000265a00310 9b007ea0 mul x0, x21, x0 0x0000000265a00314 8b000380 add x0, x28, x0 0x0000000265a00318 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a0031c 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00320 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a00324 4ea01c04 mov v4.16b, v0.16b 0x0000000265a00328 d2800200 mov x0, #0x10 0x0000000265a0032c 9b007e80 mul x0, x20, x0 0x0000000265a00330 8b000380 add x0, x28, x0 0x0000000265a00334 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a00338 58000040 ldr x0, pc+8 (addr 0x265a00340) 0x0000000265a0033c d63f0000 blr x0 ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 8, 2023
Only return the particular flags that are being requested in the moment since compacting them all when requested is fairly slow. x87 fcmov in particular was requesting all the flags when it only needs a couple. This reduces a `fcmovb` instruction count blowup from 103x to 38x. Still more room to go but this one stood out as being particularly bad. Old: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b0397 ldrb w23, [x28, FEX-Emu#704] 0x0000000265a002d4 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002d8 aa1702d6 orr x22, x22, x23 0x0000000265a002dc 394b0b97 ldrb w23, [x28, FEX-Emu#706] 0x0000000265a002e0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002e4 531e76f7 lsl w23, w23, FEX-Emu#2 0x0000000265a002e8 aa1702d6 orr x22, x22, x23 0x0000000265a002ec 394b1397 ldrb w23, [x28, FEX-Emu#708] 0x0000000265a002f0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a002f4 531c6ef7 lsl w23, w23, FEX-Emu#4 0x0000000265a002f8 aa1702d6 orr x22, x22, x23 0x0000000265a002fc 394b1b97 ldrb w23, [x28, FEX-Emu#710] 0x0000000265a00300 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00304 531a66f7 lsl w23, w23, FEX-Emu#6 0x0000000265a00308 aa1702d6 orr x22, x22, x23 0x0000000265a0030c 394b1f97 ldrb w23, [x28, FEX-Emu#711] 0x0000000265a00310 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00314 531962f7 lsl w23, w23, FEX-Emu#7 0x0000000265a00318 aa1702d6 orr x22, x22, x23 0x0000000265a0031c 394b2397 ldrb w23, [x28, FEX-Emu#712] 0x0000000265a00320 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00324 53185ef7 lsl w23, w23, FEX-Emu#8 0x0000000265a00328 aa1702d6 orr x22, x22, x23 0x0000000265a0032c 394b2797 ldrb w23, [x28, FEX-Emu#713] 0x0000000265a00330 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00334 53175af7 lsl w23, w23, FEX-Emu#9 
0x0000000265a00338 aa1702d6 orr x22, x22, x23 0x0000000265a0033c 394b2b97 ldrb w23, [x28, FEX-Emu#714] 0x0000000265a00340 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00344 531656f7 lsl w23, w23, FEX-Emu#10 0x0000000265a00348 aa1702d6 orr x22, x22, x23 0x0000000265a0034c 394b2f97 ldrb w23, [x28, FEX-Emu#715] 0x0000000265a00350 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00354 531552f7 lsl w23, w23, FEX-Emu#11 0x0000000265a00358 aa1702d6 orr x22, x22, x23 0x0000000265a0035c 394b3397 ldrb w23, [x28, FEX-Emu#716] 0x0000000265a00360 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00364 53144ef7 lsl w23, w23, FEX-Emu#12 0x0000000265a00368 aa1702d6 orr x22, x22, x23 0x0000000265a0036c 394b3b97 ldrb w23, [x28, FEX-Emu#718] 0x0000000265a00370 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00374 531246f7 lsl w23, w23, FEX-Emu#14 0x0000000265a00378 aa1702d6 orr x22, x22, x23 0x0000000265a0037c 394b4397 ldrb w23, [x28, FEX-Emu#720] 0x0000000265a00380 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00384 53103ef7 lsl w23, w23, FEX-Emu#16 0x0000000265a00388 aa1702d6 orr x22, x22, x23 0x0000000265a0038c 394b4797 ldrb w23, [x28, FEX-Emu#721] 0x0000000265a00390 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a00394 530f3af7 lsl w23, w23, FEX-Emu#17 0x0000000265a00398 aa1702d6 orr x22, x22, x23 0x0000000265a0039c 394b4b97 ldrb w23, [x28, FEX-Emu#722] 0x0000000265a003a0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003a4 530e36f7 lsl w23, w23, FEX-Emu#18 0x0000000265a003a8 aa1702d6 orr x22, x22, x23 0x0000000265a003ac 394b4f97 ldrb w23, [x28, FEX-Emu#723] 0x0000000265a003b0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003b4 530d32f7 lsl w23, w23, FEX-Emu#19 0x0000000265a003b8 aa1702d6 orr x22, x22, x23 0x0000000265a003bc 394b5397 ldrb w23, [x28, FEX-Emu#724] 0x0000000265a003c0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003c4 530c2ef7 lsl w23, w23, FEX-Emu#20 0x0000000265a003c8 aa1702d6 orr x22, x22, x23 0x0000000265a003cc 394b5797 
ldrb w23, [x28, FEX-Emu#725] 0x0000000265a003d0 d3407ef7 ubfx x23, x23, #0, FEX-Emu#32 0x0000000265a003d4 530b2af7 lsl w23, w23, FEX-Emu#21 0x0000000265a003d8 aa1702d6 orr x22, x22, x23 0x0000000265a003dc 924002d6 and x22, x22, #0x1 0x0000000265a003e0 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a003e4 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a003e8 f10002df cmp x22, #0x0 (0) 0x0000000265a003ec 9a950294 csel x20, x20, x21, eq 0x0000000265a003f0 4e080e84 dup v4.2d, x20 0x0000000265a003f4 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a003f8 91000695 add x21, x20, #0x1 (1) 0x0000000265a003fc 92400ab5 and x21, x21, #0x7 0x0000000265a00400 d2800200 mov x0, #0x10 0x0000000265a00404 9b007e80 mul x0, x20, x0 0x0000000265a00408 8b000380 add x0, x28, x0 0x0000000265a0040c 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a00410 d2800200 mov x0, #0x10 0x0000000265a00414 9b007ea0 mul x0, x21, x0 0x0000000265a00418 8b000380 add x0, x28, x0 0x0000000265a0041c 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a00420 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00424 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a00428 4ea01c04 mov v4.16b, v0.16b 0x0000000265a0042c d2800200 mov x0, #0x10 0x0000000265a00430 9b007e80 mul x0, x20, x0 0x0000000265a00434 8b000380 add x0, x28, x0 0x0000000265a00438 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a0043c 58000040 ldr x0, pc+8 (addr 0x265a00444) 0x0000000265a00440 d63f0000 blr x0 ``` New: ```asm 0x0000000265a002bc 10ffffe0 adr x0, #-0x4 (addr 0x265a002b8) 0x0000000265a002c0 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000000265a002c4 d2800014 mov x20, #0x0 0x0000000265a002c8 d2800035 mov x21, #0x1 0x0000000265a002cc d2800056 mov x22, #0x2 0x0000000265a002d0 394b0397 ldrb w23, [x28, FEX-Emu#704] 0x0000000265a002d4 330002f6 bfxil w22, w23, #0, FEX-Emu#1 0x0000000265a002d8 924002d6 and x22, x22, #0x1 0x0000000265a002dc 93400294 sbfx x20, x20, #0, FEX-Emu#1 0x0000000265a002e0 934002b5 sbfx x21, x21, #0, FEX-Emu#1 0x0000000265a002e4 f10002df 
cmp x22, #0x0 (0) 0x0000000265a002e8 9a950294 csel x20, x20, x21, eq 0x0000000265a002ec 4e080e84 dup v4.2d, x20 0x0000000265a002f0 394baf94 ldrb w20, [x28, FEX-Emu#747] 0x0000000265a002f4 91000695 add x21, x20, #0x1 (1) 0x0000000265a002f8 92400ab5 and x21, x21, #0x7 0x0000000265a002fc d2800200 mov x0, #0x10 0x0000000265a00300 9b007e80 mul x0, x20, x0 0x0000000265a00304 8b000380 add x0, x28, x0 0x0000000265a00308 3dc0bc05 ldr q5, [x0, FEX-Emu#752] 0x0000000265a0030c d2800200 mov x0, #0x10 0x0000000265a00310 9b007ea0 mul x0, x21, x0 0x0000000265a00314 8b000380 add x0, x28, x0 0x0000000265a00318 3dc0bc06 ldr q6, [x0, FEX-Emu#752] 0x0000000265a0031c 4ea41c80 mov v0.16b, v4.16b 0x0000000265a00320 6e651cc0 bsl v0.16b, v6.16b, v5.16b 0x0000000265a00324 4ea01c04 mov v4.16b, v0.16b 0x0000000265a00328 d2800200 mov x0, #0x10 0x0000000265a0032c 9b007e80 mul x0, x20, x0 0x0000000265a00330 8b000380 add x0, x28, x0 0x0000000265a00334 3d80bc04 str q4, [x0, FEX-Emu#752] 0x0000000265a00338 58000040 ldr x0, pc+8 (addr 0x265a00340) 0x0000000265a0033c d63f0000 blr x0 ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 8, 2023
This previous implementation was particularly gnarly. Because these instructions are both weakly ordered and have implementation-dependent exception and trap behaviour, these can actually be fairly conveniently converted over to a load + cmlt + bsl + str instruction sequence. For the XMM variant this reduces code blowup from 80x to 15x! For the MMX variant this reduces code blowup from 46x to 17x! Both of these improvements are significant wins! There's still some minor improvement that could be done with bsl that requires some redundant moves, but since we don't have constraint support for this we still eat two additional instructions. Before: ```asm 0x0000ffff7b800718 10ffffe0 adr x0, #-0x4 (addr 0xffff7b800714) 0x0000ffff7b80071c f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffff7b800720 4eb11e24 mov v4.16b, v17.16b 0x0000ffff7b800724 4eb01e05 mov v5.16b, v16.16b 0x0000ffff7b800728 aa0b03f4 mov x20, x11 0x0000ffff7b80072c 4e083c95 mov x21, v4.d[0] 0x0000ffff7b800730 4e083cb6 mov x22, v5.d[0] 0x0000ffff7b800734 d3471eb7 ubfx x23, x21, FEX-Emu#7, FEX-Emu#1 0x0000ffff7b800738 b4000077 cbz x23, #+0xc (addr 0xffff7b800744) 0x0000ffff7b80073c d3401ed7 uxtb x23, w22 0x0000ffff7b800740 39000297 strb w23, [x20] 0x0000ffff7b800744 d34f3eb7 ubfx x23, x21, FEX-Emu#15, FEX-Emu#1 0x0000ffff7b800748 b4000077 cbz x23, #+0xc (addr 0xffff7b800754) 0x0000ffff7b80074c d3483ed7 ubfx x23, x22, FEX-Emu#8, FEX-Emu#8 0x0000ffff7b800750 39000697 strb w23, [x20, FEX-Emu#1] 0x0000ffff7b800754 d3575eb7 ubfx x23, x21, FEX-Emu#23, FEX-Emu#1 0x0000ffff7b800758 b4000077 cbz x23, #+0xc (addr 0xffff7b800764) 0x0000ffff7b80075c d3505ed7 ubfx x23, x22, FEX-Emu#16, FEX-Emu#8 0x0000ffff7b800760 39000a97 strb w23, [x20, FEX-Emu#2] 0x0000ffff7b800764 d35f7eb7 ubfx x23, x21, FEX-Emu#31, FEX-Emu#1 0x0000ffff7b800768 b4000077 cbz x23, #+0xc (addr 0xffff7b800774) 0x0000ffff7b80076c d3587ed7 ubfx x23, x22, FEX-Emu#24, FEX-Emu#8 0x0000ffff7b800770 39000e97 strb w23, [x20, FEX-Emu#3] 0x0000ffff7b800774 d3679eb7 ubfx x23, 
x21, FEX-Emu#39, FEX-Emu#1 0x0000ffff7b800778 b4000077 cbz x23, #+0xc (addr 0xffff7b800784) 0x0000ffff7b80077c d3609ed7 ubfx x23, x22, FEX-Emu#32, FEX-Emu#8 0x0000ffff7b800780 39001297 strb w23, [x20, FEX-Emu#4] 0x0000ffff7b800784 d36fbeb7 ubfx x23, x21, FEX-Emu#47, FEX-Emu#1 0x0000ffff7b800788 b4000077 cbz x23, #+0xc (addr 0xffff7b800794) 0x0000ffff7b80078c d368bed7 ubfx x23, x22, FEX-Emu#40, FEX-Emu#8 0x0000ffff7b800790 39001697 strb w23, [x20, FEX-Emu#5] 0x0000ffff7b800794 d377deb7 ubfx x23, x21, FEX-Emu#55, FEX-Emu#1 0x0000ffff7b800798 b4000077 cbz x23, #+0xc (addr 0xffff7b8007a4) 0x0000ffff7b80079c d370ded7 ubfx x23, x22, FEX-Emu#48, FEX-Emu#8 0x0000ffff7b8007a0 39001a97 strb w23, [x20, FEX-Emu#6] 0x0000ffff7b8007a4 d37ffeb5 lsr x21, x21, FEX-Emu#63 0x0000ffff7b8007a8 b4000075 cbz x21, #+0xc (addr 0xffff7b8007b4) 0x0000ffff7b8007ac d378fed5 lsr x21, x22, FEX-Emu#56 0x0000ffff7b8007b0 39001e95 strb w21, [x20, FEX-Emu#7] 0x0000ffff7b8007b4 4e183c95 mov x21, v4.d[1] 0x0000ffff7b8007b8 4e183cb6 mov x22, v5.d[1] 0x0000ffff7b8007bc d3471eb7 ubfx x23, x21, FEX-Emu#7, FEX-Emu#1 0x0000ffff7b8007c0 b4000077 cbz x23, #+0xc (addr 0xffff7b8007cc) 0x0000ffff7b8007c4 d3401ed7 uxtb x23, w22 0x0000ffff7b8007c8 39002297 strb w23, [x20, FEX-Emu#8] 0x0000ffff7b8007cc d34f3eb7 ubfx x23, x21, FEX-Emu#15, FEX-Emu#1 0x0000ffff7b8007d0 b4000077 cbz x23, #+0xc (addr 0xffff7b8007dc) 0x0000ffff7b8007d4 d3483ed7 ubfx x23, x22, FEX-Emu#8, FEX-Emu#8 0x0000ffff7b8007d8 39002697 strb w23, [x20, FEX-Emu#9] 0x0000ffff7b8007dc d3575eb7 ubfx x23, x21, FEX-Emu#23, FEX-Emu#1 0x0000ffff7b8007e0 b4000077 cbz x23, #+0xc (addr 0xffff7b8007ec) 0x0000ffff7b8007e4 d3505ed7 ubfx x23, x22, FEX-Emu#16, FEX-Emu#8 0x0000ffff7b8007e8 39002a97 strb w23, [x20, FEX-Emu#10] 0x0000ffff7b8007ec d35f7eb7 ubfx x23, x21, FEX-Emu#31, FEX-Emu#1 0x0000ffff7b8007f0 b4000077 cbz x23, #+0xc (addr 0xffff7b8007fc) 0x0000ffff7b8007f4 d3587ed7 ubfx x23, x22, FEX-Emu#24, FEX-Emu#8 0x0000ffff7b8007f8 39002e97 strb w23, [x20, 
FEX-Emu#11] 0x0000ffff7b8007fc d3679eb7 ubfx x23, x21, FEX-Emu#39, FEX-Emu#1 0x0000ffff7b800800 b4000077 cbz x23, #+0xc (addr 0xffff7b80080c) 0x0000ffff7b800804 d3609ed7 ubfx x23, x22, FEX-Emu#32, FEX-Emu#8 0x0000ffff7b800808 39003297 strb w23, [x20, FEX-Emu#12] 0x0000ffff7b80080c d36fbeb7 ubfx x23, x21, FEX-Emu#47, FEX-Emu#1 0x0000ffff7b800810 b4000077 cbz x23, #+0xc (addr 0xffff7b80081c) 0x0000ffff7b800814 d368bed7 ubfx x23, x22, FEX-Emu#40, FEX-Emu#8 0x0000ffff7b800818 39003697 strb w23, [x20, FEX-Emu#13] 0x0000ffff7b80081c d377deb7 ubfx x23, x21, FEX-Emu#55, FEX-Emu#1 0x0000ffff7b800820 b4000077 cbz x23, #+0xc (addr 0xffff7b80082c) 0x0000ffff7b800824 d370ded7 ubfx x23, x22, FEX-Emu#48, FEX-Emu#8 0x0000ffff7b800828 39003a97 strb w23, [x20, FEX-Emu#14] 0x0000ffff7b80082c d37ffeb5 lsr x21, x21, FEX-Emu#63 0x0000ffff7b800830 b4000075 cbz x21, #+0xc (addr 0xffff7b80083c) 0x0000ffff7b800834 d378fed5 lsr x21, x22, FEX-Emu#56 0x0000ffff7b800838 39003e95 strb w21, [x20, FEX-Emu#15] 0x0000ffff7b80083c 58000040 ldr x0, pc+8 (addr 0xffff7b800844) 0x0000ffff7b800840 d63f0000 blr x0 ``` After: ```asm 0x0000ffff7ac00718 10ffffe0 adr x0, #-0x4 (addr 0xffff7ac00714) 0x0000ffff7ac0071c f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffff7ac00720 4e20aa24 cmlt v4.16b, v17.16b, #0 0x0000ffff7ac00724 3dc00165 ldr q5, [x11] 0x0000ffff7ac00728 4ea41c80 mov v0.16b, v4.16b 0x0000ffff7ac0072c 6e651e00 bsl v0.16b, v16.16b, v5.16b 0x0000ffff7ac00730 4ea01c04 mov v4.16b, v0.16b 0x0000ffff7ac00734 3d800164 str q4, [x11] 0x0000ffff7ac00738 58000040 ldr x0, pc+8 (addr 0xffff7ac00740) 0x0000ffff7ac0073c d63f0000 blr x0 ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 16, 2023
The BFI cascades in this particular instruction weren't optimal. Biggest improvement is the 8-bit version, while the 16-bit version gets a minor improvement. 8-bit instruction count reduced from 38 to 29. 16-bit instruction count reduced from 34 to 28. RCL can have a similar optimization done to it. ```asm Before 16-bit: 0x0000ffff80a801e0 10ffffe0 adr x0, #-0x4 (addr 0xffff80a801dc) 0x0000ffff80a801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffff80a801e8 d3403cb4 uxth x20, w5 0x0000ffff80a801ec d3403cf5 uxth x21, w7 0x0000ffff80a801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffff80a801f4 12001294 and w20, w20, #0x1f 0x0000ffff80a801f8 d2800017 mov x23, #0x0 0x0000ffff80a801fc b3403eb7 bfxil x23, x21, #0, FEX-Emu#16 0x0000ffff80a80200 b37002d7 bfi x23, x22, FEX-Emu#16, FEX-Emu#1 0x0000ffff80a80204 b36f3eb7 bfi x23, x21, FEX-Emu#17, FEX-Emu#16 0x0000ffff80a80208 b35f02d7 bfi x23, x22, FEX-Emu#33, FEX-Emu#1 0x0000ffff80a8020c aa1703e0 mov x0, x23 0x0000ffff80a80210 b35e3ea0 bfi x0, x21, FEX-Emu#34, FEX-Emu#16 0x0000ffff80a80214 aa0003f5 mov x21, x0 0x0000ffff80a80218 b34e02d5 bfi x21, x22, FEX-Emu#50, FEX-Emu#1 0x0000ffff80a8021c 9ad426b7 lsr x23, x21, x20 0x0000ffff80a80220 b3403ee7 bfxil x7, x23, #0, FEX-Emu#16 0x0000ffff80a80224 51000698 sub w24, w20, #0x1 (1) 0x0000ffff80a80228 9ad826b5 lsr x21, x21, x24 0x0000ffff80a8022c d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffff80a80230 7100069f cmp w20, #0x1 (1) 0x0000ffff80a80234 9a9622b4 csel x20, x21, x22, hs 0x0000ffff80a80238 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffff80a8023c d34f3ef4 ubfx x20, x23, FEX-Emu#15, FEX-Emu#1 0x0000ffff80a80240 d34e3af5 ubfx x21, x23, FEX-Emu#14, FEX-Emu#1 0x0000ffff80a80244 ca150294 eor x20, x20, x21 0x0000ffff80a80248 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffff80a8024c 58000040 ldr x0, pc+8 (addr 0xffff80a80254) 0x0000ffff80a80250 d63f0000 blr x0 0x0000ffff80a80254 967da128 bl #-0x6097b60 (addr 0xffff7a9e86f4) 0x0000ffff80a80258 0000ffff udf #0xffff 0x0000ffff80a8025c 
00010023 unallocated (Unallocated) 0x0000ffff80a80260 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 34 [DEBUG] Blow-up Amt: 34x After 16-bit: 0x0000ffffa7c801e0 10ffffe0 adr x0, #-0x4 (addr 0xffffa7c801dc) 0x0000ffffa7c801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffffa7c801e8 d3403cb4 uxth x20, w5 0x0000ffffa7c801ec d3403cf5 uxth x21, w7 0x0000ffffa7c801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffffa7c801f4 12001294 and w20, w20, #0x1f 0x0000ffffa7c801f8 b37002d5 bfi x21, x22, FEX-Emu#16, FEX-Emu#1 0x0000ffffa7c801fc b36f42b5 bfi x21, x21, FEX-Emu#17, FEX-Emu#17 0x0000ffffa7c80200 b35e42b5 bfi x21, x21, FEX-Emu#34, FEX-Emu#17 0x0000ffffa7c80204 9ad426b7 lsr x23, x21, x20 0x0000ffffa7c80208 b3403ee7 bfxil x7, x23, #0, FEX-Emu#16 0x0000ffffa7c8020c 51000698 sub w24, w20, #0x1 (1) 0x0000ffffa7c80210 9ad826b5 lsr x21, x21, x24 0x0000ffffa7c80214 d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffffa7c80218 7100069f cmp w20, #0x1 (1) 0x0000ffffa7c8021c 9a9622b4 csel x20, x21, x22, hs 0x0000ffffa7c80220 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffffa7c80224 d34f3ef4 ubfx x20, x23, FEX-Emu#15, FEX-Emu#1 0x0000ffffa7c80228 d34e3af5 ubfx x21, x23, FEX-Emu#14, FEX-Emu#1 0x0000ffffa7c8022c ca150294 eor x20, x20, x21 0x0000ffffa7c80230 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffffa7c80234 58000040 ldr x0, pc+8 (addr 0xffffa7c8023c) 0x0000ffffa7c80238 d63f0000 blr x0 0x0000ffffa7c8023c bd9cc128 unallocated (Unallocated) 0x0000ffffa7c80240 0000ffff udf #0xffff 0x0000ffffa7c80244 00010023 unallocated (Unallocated) 0x0000ffffa7c80248 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 28 [DEBUG] Blow-up Amt: 28x Before 8-bit: 0x0000ffffa92801e0 10ffffe0 adr x0, #-0x4 (addr 0xffffa92801dc) 0x0000ffffa92801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffffa92801e8 d3401cb4 uxtb x20, w5 0x0000ffffa92801ec d3401cf5 uxtb x21, w7 0x0000ffffa92801f0 394b0396 ldrb w22, 
[x28, FEX-Emu#704] 0x0000ffffa92801f4 12001294 and w20, w20, #0x1f 0x0000ffffa92801f8 d2800017 mov x23, #0x0 0x0000ffffa92801fc b3401eb7 bfxil x23, x21, #0, FEX-Emu#8 0x0000ffffa9280200 b37802d7 bfi x23, x22, FEX-Emu#8, FEX-Emu#1 0x0000ffffa9280204 b3771eb7 bfi x23, x21, FEX-Emu#9, FEX-Emu#8 0x0000ffffa9280208 b36f02d7 bfi x23, x22, FEX-Emu#17, FEX-Emu#1 0x0000ffffa928020c b36e1eb7 bfi x23, x21, FEX-Emu#18, FEX-Emu#8 0x0000ffffa9280210 b36602d7 bfi x23, x22, FEX-Emu#26, FEX-Emu#1 0x0000ffffa9280214 b3651eb7 bfi x23, x21, FEX-Emu#27, FEX-Emu#8 0x0000ffffa9280218 b35d02d7 bfi x23, x22, FEX-Emu#35, FEX-Emu#1 0x0000ffffa928021c aa1703e0 mov x0, x23 0x0000ffffa9280220 b35c1ea0 bfi x0, x21, FEX-Emu#36, FEX-Emu#8 0x0000ffffa9280224 aa0003f5 mov x21, x0 0x0000ffffa9280228 b35402d5 bfi x21, x22, FEX-Emu#44, FEX-Emu#1 0x0000ffffa928022c 9ad426b7 lsr x23, x21, x20 0x0000ffffa9280230 b3401ee7 bfxil x7, x23, #0, FEX-Emu#8 0x0000ffffa9280234 51000698 sub w24, w20, #0x1 (1) 0x0000ffffa9280238 9ad826b5 lsr x21, x21, x24 0x0000ffffa928023c d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffffa9280240 7100069f cmp w20, #0x1 (1) 0x0000ffffa9280244 9a9622b4 csel x20, x21, x22, hs 0x0000ffffa9280248 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffffa928024c d3471ef4 ubfx x20, x23, FEX-Emu#7, FEX-Emu#1 0x0000ffffa9280250 d3461af5 ubfx x21, x23, FEX-Emu#6, FEX-Emu#1 0x0000ffffa9280254 ca150294 eor x20, x20, x21 0x0000ffffa9280258 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffffa928025c 58000040 ldr x0, pc+8 (addr 0xffffa9280264) 0x0000ffffa9280260 d63f0000 blr x0 0x0000ffffa9280264 bf062128 unallocated (Unallocated) 0x0000ffffa9280268 0000ffff udf #0xffff 0x0000ffffa928026c 00010022 unallocated (Unallocated) 0x0000ffffa9280270 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 38 [DEBUG] Blow-up Amt: 38x After 8-bit: 0x0000ffff9cc801e0 10ffffe0 adr x0, #-0x4 (addr 0xffff9cc801dc) 0x0000ffff9cc801e4 f9005f80 str x0, [x28, FEX-Emu#184] 
0x0000ffff9cc801e8 d3401cb4 uxtb x20, w5 0x0000ffff9cc801ec d3401cf5 uxtb x21, w7 0x0000ffff9cc801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffff9cc801f4 12001294 and w20, w20, #0x1f 0x0000ffff9cc801f8 b37802d5 bfi x21, x22, FEX-Emu#8, FEX-Emu#1 0x0000ffff9cc801fc b37722b5 bfi x21, x21, FEX-Emu#9, FEX-Emu#9 0x0000ffff9cc80200 b36e46b5 bfi x21, x21, FEX-Emu#18, FEX-Emu#18 0x0000ffff9cc80204 b3778eb5 bfi x21, x21, FEX-Emu#9, FEX-Emu#36 0x0000ffff9cc80208 9ad426b7 lsr x23, x21, x20 0x0000ffff9cc8020c b3401ee7 bfxil x7, x23, #0, FEX-Emu#8 0x0000ffff9cc80210 51000698 sub w24, w20, #0x1 (1) 0x0000ffff9cc80214 9ad826b5 lsr x21, x21, x24 0x0000ffff9cc80218 d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffff9cc8021c 7100069f cmp w20, #0x1 (1) 0x0000ffff9cc80220 9a9622b4 csel x20, x21, x22, hs 0x0000ffff9cc80224 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffff9cc80228 d3471ef4 ubfx x20, x23, FEX-Emu#7, FEX-Emu#1 0x0000ffff9cc8022c d3461af5 ubfx x21, x23, FEX-Emu#6, FEX-Emu#1 0x0000ffff9cc80230 ca150294 eor x20, x20, x21 0x0000ffff9cc80234 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffff9cc80238 58000040 ldr x0, pc+8 (addr 0xffff9cc80240) 0x0000ffff9cc8023c d63f0000 blr x0 0x0000ffff9cc80240 b2a75128 unallocated (Unallocated) 0x0000ffff9cc80244 0000ffff udf #0xffff 0x0000ffff9cc80248 00010022 unallocated (Unallocated) 0x0000ffff9cc8024c 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 29 [DEBUG] Blow-up Amt: 29x ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jul 18, 2023
The BFI cascades in this particular instruction weren't optimal. Biggest improvement is the 8-bit version, while the 16-bit version gets a minor improvement. 8-bit instruction count reduced from 38 to 29. 16-bit instruction count reduced from 34 to 28. RCL can have a similar optimization done to it. ```asm Before 16-bit: 0x0000ffff80a801e0 10ffffe0 adr x0, #-0x4 (addr 0xffff80a801dc) 0x0000ffff80a801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffff80a801e8 d3403cb4 uxth x20, w5 0x0000ffff80a801ec d3403cf5 uxth x21, w7 0x0000ffff80a801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffff80a801f4 12001294 and w20, w20, #0x1f 0x0000ffff80a801f8 d2800017 mov x23, #0x0 0x0000ffff80a801fc b3403eb7 bfxil x23, x21, #0, FEX-Emu#16 0x0000ffff80a80200 b37002d7 bfi x23, x22, FEX-Emu#16, FEX-Emu#1 0x0000ffff80a80204 b36f3eb7 bfi x23, x21, FEX-Emu#17, FEX-Emu#16 0x0000ffff80a80208 b35f02d7 bfi x23, x22, FEX-Emu#33, FEX-Emu#1 0x0000ffff80a8020c aa1703e0 mov x0, x23 0x0000ffff80a80210 b35e3ea0 bfi x0, x21, FEX-Emu#34, FEX-Emu#16 0x0000ffff80a80214 aa0003f5 mov x21, x0 0x0000ffff80a80218 b34e02d5 bfi x21, x22, FEX-Emu#50, FEX-Emu#1 0x0000ffff80a8021c 9ad426b7 lsr x23, x21, x20 0x0000ffff80a80220 b3403ee7 bfxil x7, x23, #0, FEX-Emu#16 0x0000ffff80a80224 51000698 sub w24, w20, #0x1 (1) 0x0000ffff80a80228 9ad826b5 lsr x21, x21, x24 0x0000ffff80a8022c d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffff80a80230 7100069f cmp w20, #0x1 (1) 0x0000ffff80a80234 9a9622b4 csel x20, x21, x22, hs 0x0000ffff80a80238 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffff80a8023c d34f3ef4 ubfx x20, x23, FEX-Emu#15, FEX-Emu#1 0x0000ffff80a80240 d34e3af5 ubfx x21, x23, FEX-Emu#14, FEX-Emu#1 0x0000ffff80a80244 ca150294 eor x20, x20, x21 0x0000ffff80a80248 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffff80a8024c 58000040 ldr x0, pc+8 (addr 0xffff80a80254) 0x0000ffff80a80250 d63f0000 blr x0 0x0000ffff80a80254 967da128 bl #-0x6097b60 (addr 0xffff7a9e86f4) 0x0000ffff80a80258 0000ffff udf #0xffff 0x0000ffff80a8025c 
00010023 unallocated (Unallocated) 0x0000ffff80a80260 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 34 [DEBUG] Blow-up Amt: 34x After 16-bit: 0x0000ffffa7c801e0 10ffffe0 adr x0, #-0x4 (addr 0xffffa7c801dc) 0x0000ffffa7c801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffffa7c801e8 d3403cb4 uxth x20, w5 0x0000ffffa7c801ec d3403cf5 uxth x21, w7 0x0000ffffa7c801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffffa7c801f4 12001294 and w20, w20, #0x1f 0x0000ffffa7c801f8 b37002d5 bfi x21, x22, FEX-Emu#16, FEX-Emu#1 0x0000ffffa7c801fc b36f42b5 bfi x21, x21, FEX-Emu#17, FEX-Emu#17 0x0000ffffa7c80200 b35e42b5 bfi x21, x21, FEX-Emu#34, FEX-Emu#17 0x0000ffffa7c80204 9ad426b7 lsr x23, x21, x20 0x0000ffffa7c80208 b3403ee7 bfxil x7, x23, #0, FEX-Emu#16 0x0000ffffa7c8020c 51000698 sub w24, w20, #0x1 (1) 0x0000ffffa7c80210 9ad826b5 lsr x21, x21, x24 0x0000ffffa7c80214 d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffffa7c80218 7100069f cmp w20, #0x1 (1) 0x0000ffffa7c8021c 9a9622b4 csel x20, x21, x22, hs 0x0000ffffa7c80220 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffffa7c80224 d34f3ef4 ubfx x20, x23, FEX-Emu#15, FEX-Emu#1 0x0000ffffa7c80228 d34e3af5 ubfx x21, x23, FEX-Emu#14, FEX-Emu#1 0x0000ffffa7c8022c ca150294 eor x20, x20, x21 0x0000ffffa7c80230 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffffa7c80234 58000040 ldr x0, pc+8 (addr 0xffffa7c8023c) 0x0000ffffa7c80238 d63f0000 blr x0 0x0000ffffa7c8023c bd9cc128 unallocated (Unallocated) 0x0000ffffa7c80240 0000ffff udf #0xffff 0x0000ffffa7c80244 00010023 unallocated (Unallocated) 0x0000ffffa7c80248 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 28 [DEBUG] Blow-up Amt: 28x Before 8-bit: 0x0000ffffa92801e0 10ffffe0 adr x0, #-0x4 (addr 0xffffa92801dc) 0x0000ffffa92801e4 f9005f80 str x0, [x28, FEX-Emu#184] 0x0000ffffa92801e8 d3401cb4 uxtb x20, w5 0x0000ffffa92801ec d3401cf5 uxtb x21, w7 0x0000ffffa92801f0 394b0396 ldrb w22, 
[x28, FEX-Emu#704] 0x0000ffffa92801f4 12001294 and w20, w20, #0x1f 0x0000ffffa92801f8 d2800017 mov x23, #0x0 0x0000ffffa92801fc b3401eb7 bfxil x23, x21, #0, FEX-Emu#8 0x0000ffffa9280200 b37802d7 bfi x23, x22, FEX-Emu#8, FEX-Emu#1 0x0000ffffa9280204 b3771eb7 bfi x23, x21, FEX-Emu#9, FEX-Emu#8 0x0000ffffa9280208 b36f02d7 bfi x23, x22, FEX-Emu#17, FEX-Emu#1 0x0000ffffa928020c b36e1eb7 bfi x23, x21, FEX-Emu#18, FEX-Emu#8 0x0000ffffa9280210 b36602d7 bfi x23, x22, FEX-Emu#26, FEX-Emu#1 0x0000ffffa9280214 b3651eb7 bfi x23, x21, FEX-Emu#27, FEX-Emu#8 0x0000ffffa9280218 b35d02d7 bfi x23, x22, FEX-Emu#35, FEX-Emu#1 0x0000ffffa928021c aa1703e0 mov x0, x23 0x0000ffffa9280220 b35c1ea0 bfi x0, x21, FEX-Emu#36, FEX-Emu#8 0x0000ffffa9280224 aa0003f5 mov x21, x0 0x0000ffffa9280228 b35402d5 bfi x21, x22, FEX-Emu#44, FEX-Emu#1 0x0000ffffa928022c 9ad426b7 lsr x23, x21, x20 0x0000ffffa9280230 b3401ee7 bfxil x7, x23, #0, FEX-Emu#8 0x0000ffffa9280234 51000698 sub w24, w20, #0x1 (1) 0x0000ffffa9280238 9ad826b5 lsr x21, x21, x24 0x0000ffffa928023c d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffffa9280240 7100069f cmp w20, #0x1 (1) 0x0000ffffa9280244 9a9622b4 csel x20, x21, x22, hs 0x0000ffffa9280248 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffffa928024c d3471ef4 ubfx x20, x23, FEX-Emu#7, FEX-Emu#1 0x0000ffffa9280250 d3461af5 ubfx x21, x23, FEX-Emu#6, FEX-Emu#1 0x0000ffffa9280254 ca150294 eor x20, x20, x21 0x0000ffffa9280258 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffffa928025c 58000040 ldr x0, pc+8 (addr 0xffffa9280264) 0x0000ffffa9280260 d63f0000 blr x0 0x0000ffffa9280264 bf062128 unallocated (Unallocated) 0x0000ffffa9280268 0000ffff udf #0xffff 0x0000ffffa928026c 00010022 unallocated (Unallocated) 0x0000ffffa9280270 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 38 [DEBUG] Blow-up Amt: 38x After 8-bit: 0x0000ffff9cc801e0 10ffffe0 adr x0, #-0x4 (addr 0xffff9cc801dc) 0x0000ffff9cc801e4 f9005f80 str x0, [x28, FEX-Emu#184] 
0x0000ffff9cc801e8 d3401cb4 uxtb x20, w5 0x0000ffff9cc801ec d3401cf5 uxtb x21, w7 0x0000ffff9cc801f0 394b0396 ldrb w22, [x28, FEX-Emu#704] 0x0000ffff9cc801f4 12001294 and w20, w20, #0x1f 0x0000ffff9cc801f8 b37802d5 bfi x21, x22, FEX-Emu#8, FEX-Emu#1 0x0000ffff9cc801fc b37722b5 bfi x21, x21, FEX-Emu#9, FEX-Emu#9 0x0000ffff9cc80200 b36e46b5 bfi x21, x21, FEX-Emu#18, FEX-Emu#18 0x0000ffff9cc80204 b3778eb5 bfi x21, x21, FEX-Emu#9, FEX-Emu#36 0x0000ffff9cc80208 9ad426b7 lsr x23, x21, x20 0x0000ffff9cc8020c b3401ee7 bfxil x7, x23, #0, FEX-Emu#8 0x0000ffff9cc80210 51000698 sub w24, w20, #0x1 (1) 0x0000ffff9cc80214 9ad826b5 lsr x21, x21, x24 0x0000ffff9cc80218 d34002b5 ubfx x21, x21, #0, FEX-Emu#1 0x0000ffff9cc8021c 7100069f cmp w20, #0x1 (1) 0x0000ffff9cc80220 9a9622b4 csel x20, x21, x22, hs 0x0000ffff9cc80224 390b0394 strb w20, [x28, FEX-Emu#704] 0x0000ffff9cc80228 d3471ef4 ubfx x20, x23, FEX-Emu#7, FEX-Emu#1 0x0000ffff9cc8022c d3461af5 ubfx x21, x23, FEX-Emu#6, FEX-Emu#1 0x0000ffff9cc80230 ca150294 eor x20, x20, x21 0x0000ffff9cc80234 390b2f94 strb w20, [x28, FEX-Emu#715] 0x0000ffff9cc80238 58000040 ldr x0, pc+8 (addr 0xffff9cc80240) 0x0000ffff9cc8023c d63f0000 blr x0 0x0000ffff9cc80240 b2a75128 unallocated (Unallocated) 0x0000ffff9cc80244 0000ffff udf #0xffff 0x0000ffff9cc80248 00010022 unallocated (Unallocated) 0x0000ffff9cc8024c 00000000 udf #0x0 [DEBUG] RIP: 0x10020 [DEBUG] Guest Code instructions: 1 [DEBUG] Host Code instructions: 29 [DEBUG] Blow-up Amt: 29x ```
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 41, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov w0, v17.s[0]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[1]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[2]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[3]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov w0, v2.s[0]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[1]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[2]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[3]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 41, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov w0, v17.s[0]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[1]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[2]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v17.s[3]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov w0, v2.s[0]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[1]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[2]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "mov w0, v2.s[3]", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 37, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov x0, v17.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v17.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov x0, v2.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v2.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 37, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov x0, v17.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v17.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov x0, v2.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v2.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 37, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov x0, v17.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v17.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov x0, v2.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v2.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sonicadvance1
added a commit
to Sonicadvance1/FEX
that referenced
this pull request
Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128, this is required. The result is pretty gnarly, but these instructions aren't used often, so that's fine from a compatibility perspective. Example SVE128 implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 9, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "cmplt p0.s, p6/z, z17.s, #0", "ld1w {z16.s}, p0/z, [x4]", "add x21, x4, #0x10 (16)", "cmplt p0.s, p6/z, z2.s, #0", "ld1w {z2.s}, p0/z, [x21]", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` Example ASIMD implementation: ```json "vmaskmovps ymm0, ymm1, [rax]": { "ExpectedInstructionCount": 37, "Comment": [ "Map 2 0b01 0x2c 256-bit" ], "ExpectedArm64ASM": [ "ldr q2, [x28, #32]", "mrs x20, nzcv", "movi v0.2d, #0x0", "mov x1, x4", "mov x0, v17.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v17.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v16.16b, v0.16b", "add x21, x4, #0x10 (16)", "movi v0.2d, #0x0", "mov x1, x21", "mov x0, v2.d[0]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[0], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[1], [x1]", "add x1, x1, #0x4 (4)", "mov x0, v2.d[1]", "tbz x0, #63, #+0x8", "ld1 {v0.s}[2], [x1]", "add x1, x1, #0x4 (4)", "tbz w0, #31, #+0x8", "ld1 {v0.s}[3], [x1]", "mov v2.16b, v0.16b", "str q2, [x28, #16]", "msr nzcv, x20" ] }, ``` There is room for a small further improvement: the ASIMD implementation does not actually need to touch nzcv, but I'll leave that for a future change.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.