Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements Vector_FTOZS op in x86 JIT #63

Merged
merged 1 commit into from
Mar 22, 2020

Conversation

Sonicadvance1
Copy link
Member

No description provided.

@skmp skmp merged commit 7930712 into FEX-Emu:master Mar 22, 2020
@Sonicadvance1 Sonicadvance1 deleted the vector_ftozs branch March 22, 2020 20:10
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jul 8, 2023
This previous implementation was particularly gnarly. Because these
instructions are both weackly ordered and have implementation dependent
exception and trap behaviour these can actually be fairly conveniently
converted over to a load + cmlt + bsl + str instruction.

For the XMM variant this reduces code blowup from 80x to 15x!
For the MMX variant this reduces code blowup from 46x to 17x!

Both of these improvements are significant wins! There's still some
minor improvement that could be done with bsl that requires some
redundant moves, but since we don't have constraint support for this we
still eat two additional instructions

Before:
```asm
0x0000ffff7b800718  10ffffe0    adr x0, #-0x4 (addr 0xffff7b800714)
0x0000ffff7b80071c  f9005f80    str x0, [x28, FEX-Emu#184]
0x0000ffff7b800720  4eb11e24    mov v4.16b, v17.16b
0x0000ffff7b800724  4eb01e05    mov v5.16b, v16.16b
0x0000ffff7b800728  aa0b03f4    mov x20, x11
0x0000ffff7b80072c  4e083c95    mov x21, v4.d[0]
0x0000ffff7b800730  4e083cb6    mov x22, v5.d[0]
0x0000ffff7b800734  d3471eb7    ubfx x23, x21, FEX-Emu#7, FEX-Emu#1
0x0000ffff7b800738  b4000077    cbz x23, #+0xc (addr 0xffff7b800744)
0x0000ffff7b80073c  d3401ed7    uxtb x23, w22
0x0000ffff7b800740  39000297    strb w23, [x20]
0x0000ffff7b800744  d34f3eb7    ubfx x23, x21, FEX-Emu#15, FEX-Emu#1
0x0000ffff7b800748  b4000077    cbz x23, #+0xc (addr 0xffff7b800754)
0x0000ffff7b80074c  d3483ed7    ubfx x23, x22, FEX-Emu#8, FEX-Emu#8
0x0000ffff7b800750  39000697    strb w23, [x20, FEX-Emu#1]
0x0000ffff7b800754  d3575eb7    ubfx x23, x21, FEX-Emu#23, FEX-Emu#1
0x0000ffff7b800758  b4000077    cbz x23, #+0xc (addr 0xffff7b800764)
0x0000ffff7b80075c  d3505ed7    ubfx x23, x22, FEX-Emu#16, FEX-Emu#8
0x0000ffff7b800760  39000a97    strb w23, [x20, FEX-Emu#2]
0x0000ffff7b800764  d35f7eb7    ubfx x23, x21, FEX-Emu#31, FEX-Emu#1
0x0000ffff7b800768  b4000077    cbz x23, #+0xc (addr 0xffff7b800774)
0x0000ffff7b80076c  d3587ed7    ubfx x23, x22, FEX-Emu#24, FEX-Emu#8
0x0000ffff7b800770  39000e97    strb w23, [x20, FEX-Emu#3]
0x0000ffff7b800774  d3679eb7    ubfx x23, x21, FEX-Emu#39, FEX-Emu#1
0x0000ffff7b800778  b4000077    cbz x23, #+0xc (addr 0xffff7b800784)
0x0000ffff7b80077c  d3609ed7    ubfx x23, x22, FEX-Emu#32, FEX-Emu#8
0x0000ffff7b800780  39001297    strb w23, [x20, FEX-Emu#4]
0x0000ffff7b800784  d36fbeb7    ubfx x23, x21, FEX-Emu#47, FEX-Emu#1
0x0000ffff7b800788  b4000077    cbz x23, #+0xc (addr 0xffff7b800794)
0x0000ffff7b80078c  d368bed7    ubfx x23, x22, FEX-Emu#40, FEX-Emu#8
0x0000ffff7b800790  39001697    strb w23, [x20, FEX-Emu#5]
0x0000ffff7b800794  d377deb7    ubfx x23, x21, FEX-Emu#55, FEX-Emu#1
0x0000ffff7b800798  b4000077    cbz x23, #+0xc (addr 0xffff7b8007a4)
0x0000ffff7b80079c  d370ded7    ubfx x23, x22, FEX-Emu#48, FEX-Emu#8
0x0000ffff7b8007a0  39001a97    strb w23, [x20, FEX-Emu#6]
0x0000ffff7b8007a4  d37ffeb5    lsr x21, x21, FEX-Emu#63
0x0000ffff7b8007a8  b4000075    cbz x21, #+0xc (addr 0xffff7b8007b4)
0x0000ffff7b8007ac  d378fed5    lsr x21, x22, FEX-Emu#56
0x0000ffff7b8007b0  39001e95    strb w21, [x20, FEX-Emu#7]
0x0000ffff7b8007b4  4e183c95    mov x21, v4.d[1]
0x0000ffff7b8007b8  4e183cb6    mov x22, v5.d[1]
0x0000ffff7b8007bc  d3471eb7    ubfx x23, x21, FEX-Emu#7, FEX-Emu#1
0x0000ffff7b8007c0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007cc)
0x0000ffff7b8007c4  d3401ed7    uxtb x23, w22
0x0000ffff7b8007c8  39002297    strb w23, [x20, FEX-Emu#8]
0x0000ffff7b8007cc  d34f3eb7    ubfx x23, x21, FEX-Emu#15, FEX-Emu#1
0x0000ffff7b8007d0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007dc)
0x0000ffff7b8007d4  d3483ed7    ubfx x23, x22, FEX-Emu#8, FEX-Emu#8
0x0000ffff7b8007d8  39002697    strb w23, [x20, FEX-Emu#9]
0x0000ffff7b8007dc  d3575eb7    ubfx x23, x21, FEX-Emu#23, FEX-Emu#1
0x0000ffff7b8007e0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007ec)
0x0000ffff7b8007e4  d3505ed7    ubfx x23, x22, FEX-Emu#16, FEX-Emu#8
0x0000ffff7b8007e8  39002a97    strb w23, [x20, FEX-Emu#10]
0x0000ffff7b8007ec  d35f7eb7    ubfx x23, x21, FEX-Emu#31, FEX-Emu#1
0x0000ffff7b8007f0  b4000077    cbz x23, #+0xc (addr 0xffff7b8007fc)
0x0000ffff7b8007f4  d3587ed7    ubfx x23, x22, FEX-Emu#24, FEX-Emu#8
0x0000ffff7b8007f8  39002e97    strb w23, [x20, FEX-Emu#11]
0x0000ffff7b8007fc  d3679eb7    ubfx x23, x21, FEX-Emu#39, FEX-Emu#1
0x0000ffff7b800800  b4000077    cbz x23, #+0xc (addr 0xffff7b80080c)
0x0000ffff7b800804  d3609ed7    ubfx x23, x22, FEX-Emu#32, FEX-Emu#8
0x0000ffff7b800808  39003297    strb w23, [x20, FEX-Emu#12]
0x0000ffff7b80080c  d36fbeb7    ubfx x23, x21, FEX-Emu#47, FEX-Emu#1
0x0000ffff7b800810  b4000077    cbz x23, #+0xc (addr 0xffff7b80081c)
0x0000ffff7b800814  d368bed7    ubfx x23, x22, FEX-Emu#40, FEX-Emu#8
0x0000ffff7b800818  39003697    strb w23, [x20, FEX-Emu#13]
0x0000ffff7b80081c  d377deb7    ubfx x23, x21, FEX-Emu#55, FEX-Emu#1
0x0000ffff7b800820  b4000077    cbz x23, #+0xc (addr 0xffff7b80082c)
0x0000ffff7b800824  d370ded7    ubfx x23, x22, FEX-Emu#48, FEX-Emu#8
0x0000ffff7b800828  39003a97    strb w23, [x20, FEX-Emu#14]
0x0000ffff7b80082c  d37ffeb5    lsr x21, x21, FEX-Emu#63
0x0000ffff7b800830  b4000075    cbz x21, #+0xc (addr 0xffff7b80083c)
0x0000ffff7b800834  d378fed5    lsr x21, x22, FEX-Emu#56
0x0000ffff7b800838  39003e95    strb w21, [x20, FEX-Emu#15]
0x0000ffff7b80083c  58000040    ldr x0, pc+8 (addr 0xffff7b800844)
0x0000ffff7b800840  d63f0000    blr x0
```

After:
```asm
0x0000ffff7ac00718  10ffffe0            adr x0, #-0x4 (addr 0xffff7ac00714)
0x0000ffff7ac0071c  f9005f80            str x0, [x28, FEX-Emu#184]
0x0000ffff7ac00720  4e20aa24            cmlt v4.16b, v17.16b, #0
0x0000ffff7ac00724  3dc00165            ldr q5, [x11]
0x0000ffff7ac00728  4ea41c80            mov v0.16b, v4.16b
0x0000ffff7ac0072c  6e651e00            bsl v0.16b, v16.16b, v5.16b
0x0000ffff7ac00730  4ea01c04            mov v4.16b, v0.16b
0x0000ffff7ac00734  3d800164            str q4, [x11]
0x0000ffff7ac00738  58000040            ldr x0, pc+8 (addr 0xffff7ac00740)
0x0000ffff7ac0073c  d63f0000            blr x0
```
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's a little bit of an improvement where nzcv isn't needed to get
touched on the ASIMD implementation, but I'll leave that for a future
improvement.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's a little bit of an improvement where nzcv isn't needed to get
touched on the ASIMD implementation, but I'll leave that for a future
improvement.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's a little bit of an improvement where nzcv isn't needed to get
touched on the ASIMD implementation, but I'll leave that for a future
improvement.
Sonicadvance1 added a commit to Sonicadvance1/FEX that referenced this pull request Jun 21, 2024
In order to support `vmaskmov{ps,pd}` without SVE128 this is required.
It's pretty gnarly but they aren't often used so that's fine from a
compatibility perspective.

Example SVE128 implementation:
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 9,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "cmplt p0.s, p6/z, z17.s, #0",
        "ld1w {z16.s}, p0/z, [x4]",
        "add x21, x4, #0x10 (16)",
        "cmplt p0.s, p6/z, z2.s, #0",
        "ld1w {z2.s}, p0/z, [x21]",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

Example ASIMD implementation
```json
    "vmaskmovps ymm0, ymm1, [rax]": {
      "ExpectedInstructionCount": 37,
      "Comment": [
        "Map 2 0b01 0x2c 256-bit"
      ],
      "ExpectedArm64ASM": [
        "ldr q2, [x28, FEX-Emu#32]",
        "mrs x20, nzcv",
        "movi v0.2d, #0x0",
        "mov x1, x4",
        "mov x0, v17.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v17.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v16.16b, v0.16b",
        "add x21, x4, #0x10 (16)",
        "movi v0.2d, #0x0",
        "mov x1, x21",
        "mov x0, v2.d[0]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[0], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[1], [x1]",
        "add x1, x1, #0x4 (4)",
        "mov x0, v2.d[1]",
        "tbz x0, FEX-Emu#63, #+0x8",
        "ld1 {v0.s}[2], [x1]",
        "add x1, x1, #0x4 (4)",
        "tbz w0, FEX-Emu#31, #+0x8",
        "ld1 {v0.s}[3], [x1]",
        "mov v2.16b, v0.16b",
        "str q2, [x28, FEX-Emu#16]",
        "msr nzcv, x20"
      ]
    },
```

There's a little bit of an improvement where nzcv isn't needed to get
touched on the ASIMD implementation, but I'll leave that for a future
improvement.
alyssarosenzweig added a commit to alyssarosenzweig/FEX that referenced this pull request Aug 10, 2024
generally same # of instructions, but potentially fewer cycles:

old:
        "rmif x4, FEX-Emu#63, #nzCv",
        "cfinv"

new:
        "mvn x20, x4",
        "rmif x20, FEX-Emu#63, #nzCv"

Signed-off-by: Alyssa Rosenzweig <[email protected]>
alyssarosenzweig added a commit to alyssarosenzweig/FEX that referenced this pull request Aug 10, 2024
generally same # of instructions, but potentially fewer cycles:

old:
        "rmif x4, FEX-Emu#63, #nzCv",
        "cfinv"

new:
        "xor x20, x4, FEX-Emu#1",
        "rmif x20, FEX-Emu#63, #nzCv"

Signed-off-by: Alyssa Rosenzweig <[email protected]>
alyssarosenzweig added a commit to alyssarosenzweig/FEX that referenced this pull request Aug 10, 2024
generally same # of instructions, but potentially fewer cycles:

old:
        "rmif x4, FEX-Emu#63, #nzCv",
        "cfinv"

new:
        "xor x20, x4, FEX-Emu#1",
        "rmif x20, FEX-Emu#63, #nzCv"

Signed-off-by: Alyssa Rosenzweig <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants