Skip to content

Commit

Permalink
OpcodeDispatcher: Optimize 8/16-bit RCR
Browse files Browse the repository at this point in the history
The BFI cascades in this particular instruction weren't optimal.
The biggest improvement is in the 8-bit version, while the 16-bit version gets
only a minor improvement.

8-bit instruction count reduced from 38 to 29.
16-bit instruction count reduced from 34 to 28.

RCL can have a similar optimization done to it.
```asm
Before 16-bit:
0x0000ffff80a801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff80a801dc)
0x0000ffff80a801e4  f9005f80    str x0, [x28, #184]
0x0000ffff80a801e8  d3403cb4    uxth x20, w5
0x0000ffff80a801ec  d3403cf5    uxth x21, w7
0x0000ffff80a801f0  394b0396    ldrb w22, [x28, #704]
0x0000ffff80a801f4  12001294    and w20, w20, #0x1f
0x0000ffff80a801f8  d2800017    mov x23, #0x0
0x0000ffff80a801fc  b3403eb7    bfxil x23, x21, #0, #16
0x0000ffff80a80200  b37002d7    bfi x23, x22, #16, #1
0x0000ffff80a80204  b36f3eb7    bfi x23, x21, #17, #16
0x0000ffff80a80208  b35f02d7    bfi x23, x22, #33, #1
0x0000ffff80a8020c  aa1703e0    mov x0, x23
0x0000ffff80a80210  b35e3ea0    bfi x0, x21, #34, #16
0x0000ffff80a80214  aa0003f5    mov x21, x0
0x0000ffff80a80218  b34e02d5    bfi x21, x22, #50, #1
0x0000ffff80a8021c  9ad426b7    lsr x23, x21, x20
0x0000ffff80a80220  b3403ee7    bfxil x7, x23, #0, #16
0x0000ffff80a80224  51000698    sub w24, w20, #0x1 (1)
0x0000ffff80a80228  9ad826b5    lsr x21, x21, x24
0x0000ffff80a8022c  d34002b5    ubfx x21, x21, #0, #1
0x0000ffff80a80230  7100069f    cmp w20, #0x1 (1)
0x0000ffff80a80234  9a9622b4    csel x20, x21, x22, hs
0x0000ffff80a80238  390b0394    strb w20, [x28, #704]
0x0000ffff80a8023c  d34f3ef4    ubfx x20, x23, #15, #1
0x0000ffff80a80240  d34e3af5    ubfx x21, x23, #14, #1
0x0000ffff80a80244  ca150294    eor x20, x20, x21
0x0000ffff80a80248  390b2f94    strb w20, [x28, #715]
0x0000ffff80a8024c  58000040    ldr x0, pc+8 (addr 0xffff80a80254)
0x0000ffff80a80250  d63f0000    blr x0
0x0000ffff80a80254  967da128    bl #-0x6097b60 (addr 0xffff7a9e86f4)
0x0000ffff80a80258  0000ffff    udf #0xffff
0x0000ffff80a8025c  00010023    unallocated (Unallocated)
0x0000ffff80a80260  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 34
[DEBUG] Blow-up Amt: 34x

After 16-bit:
0x0000ffffa7c801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa7c801dc)
0x0000ffffa7c801e4  f9005f80            str x0, [x28, #184]
0x0000ffffa7c801e8  d3403cb4            uxth x20, w5
0x0000ffffa7c801ec  d3403cf5            uxth x21, w7
0x0000ffffa7c801f0  394b0396            ldrb w22, [x28, #704]
0x0000ffffa7c801f4  12001294            and w20, w20, #0x1f
0x0000ffffa7c801f8  b37002d5            bfi x21, x22, #16, #1
0x0000ffffa7c801fc  b36f42b5            bfi x21, x21, #17, #17
0x0000ffffa7c80200  b35e42b5            bfi x21, x21, #34, #17
0x0000ffffa7c80204  9ad426b7            lsr x23, x21, x20
0x0000ffffa7c80208  b3403ee7            bfxil x7, x23, #0, #16
0x0000ffffa7c8020c  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa7c80210  9ad826b5            lsr x21, x21, x24
0x0000ffffa7c80214  d34002b5            ubfx x21, x21, #0, #1
0x0000ffffa7c80218  7100069f            cmp w20, #0x1 (1)
0x0000ffffa7c8021c  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa7c80220  390b0394            strb w20, [x28, #704]
0x0000ffffa7c80224  d34f3ef4            ubfx x20, x23, #15, #1
0x0000ffffa7c80228  d34e3af5            ubfx x21, x23, #14, #1
0x0000ffffa7c8022c  ca150294            eor x20, x20, x21
0x0000ffffa7c80230  390b2f94            strb w20, [x28, #715]
0x0000ffffa7c80234  58000040            ldr x0, pc+8 (addr 0xffffa7c8023c)
0x0000ffffa7c80238  d63f0000            blr x0
0x0000ffffa7c8023c  bd9cc128            unallocated (Unallocated)
0x0000ffffa7c80240  0000ffff            udf #0xffff
0x0000ffffa7c80244  00010023            unallocated (Unallocated)
0x0000ffffa7c80248  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 28
[DEBUG] Blow-up Amt: 28x

Before 8-bit:
0x0000ffffa92801e0  10ffffe0            adr x0, #-0x4 (addr 0xffffa92801dc)
0x0000ffffa92801e4  f9005f80            str x0, [x28, #184]
0x0000ffffa92801e8  d3401cb4            uxtb x20, w5
0x0000ffffa92801ec  d3401cf5            uxtb x21, w7
0x0000ffffa92801f0  394b0396            ldrb w22, [x28, #704]
0x0000ffffa92801f4  12001294            and w20, w20, #0x1f
0x0000ffffa92801f8  d2800017            mov x23, #0x0
0x0000ffffa92801fc  b3401eb7            bfxil x23, x21, #0, #8
0x0000ffffa9280200  b37802d7            bfi x23, x22, #8, #1
0x0000ffffa9280204  b3771eb7            bfi x23, x21, #9, #8
0x0000ffffa9280208  b36f02d7            bfi x23, x22, #17, #1
0x0000ffffa928020c  b36e1eb7            bfi x23, x21, #18, #8
0x0000ffffa9280210  b36602d7            bfi x23, x22, #26, #1
0x0000ffffa9280214  b3651eb7            bfi x23, x21, #27, #8
0x0000ffffa9280218  b35d02d7            bfi x23, x22, #35, #1
0x0000ffffa928021c  aa1703e0            mov x0, x23
0x0000ffffa9280220  b35c1ea0            bfi x0, x21, #36, #8
0x0000ffffa9280224  aa0003f5            mov x21, x0
0x0000ffffa9280228  b35402d5            bfi x21, x22, #44, #1
0x0000ffffa928022c  9ad426b7            lsr x23, x21, x20
0x0000ffffa9280230  b3401ee7            bfxil x7, x23, #0, #8
0x0000ffffa9280234  51000698            sub w24, w20, #0x1 (1)
0x0000ffffa9280238  9ad826b5            lsr x21, x21, x24
0x0000ffffa928023c  d34002b5            ubfx x21, x21, #0, #1
0x0000ffffa9280240  7100069f            cmp w20, #0x1 (1)
0x0000ffffa9280244  9a9622b4            csel x20, x21, x22, hs
0x0000ffffa9280248  390b0394            strb w20, [x28, #704]
0x0000ffffa928024c  d3471ef4            ubfx x20, x23, #7, #1
0x0000ffffa9280250  d3461af5            ubfx x21, x23, #6, #1
0x0000ffffa9280254  ca150294            eor x20, x20, x21
0x0000ffffa9280258  390b2f94            strb w20, [x28, #715]
0x0000ffffa928025c  58000040            ldr x0, pc+8 (addr 0xffffa9280264)
0x0000ffffa9280260  d63f0000            blr x0
0x0000ffffa9280264  bf062128            unallocated (Unallocated)
0x0000ffffa9280268  0000ffff            udf #0xffff
0x0000ffffa928026c  00010022            unallocated (Unallocated)
0x0000ffffa9280270  00000000            udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 38
[DEBUG] Blow-up Amt: 38x

After 8-bit:
0x0000ffff9cc801e0  10ffffe0    adr x0, #-0x4 (addr 0xffff9cc801dc)
0x0000ffff9cc801e4  f9005f80    str x0, [x28, #184]
0x0000ffff9cc801e8  d3401cb4    uxtb x20, w5
0x0000ffff9cc801ec  d3401cf5    uxtb x21, w7
0x0000ffff9cc801f0  394b0396    ldrb w22, [x28, #704]
0x0000ffff9cc801f4  12001294    and w20, w20, #0x1f
0x0000ffff9cc801f8  b37802d5    bfi x21, x22, #8, #1
0x0000ffff9cc801fc  b37722b5    bfi x21, x21, #9, #9
0x0000ffff9cc80200  b36e46b5    bfi x21, x21, #18, #18
0x0000ffff9cc80204  b3778eb5    bfi x21, x21, #9, #36
0x0000ffff9cc80208  9ad426b7    lsr x23, x21, x20
0x0000ffff9cc8020c  b3401ee7    bfxil x7, x23, #0, #8
0x0000ffff9cc80210  51000698    sub w24, w20, #0x1 (1)
0x0000ffff9cc80214  9ad826b5    lsr x21, x21, x24
0x0000ffff9cc80218  d34002b5    ubfx x21, x21, #0, #1
0x0000ffff9cc8021c  7100069f    cmp w20, #0x1 (1)
0x0000ffff9cc80220  9a9622b4    csel x20, x21, x22, hs
0x0000ffff9cc80224  390b0394    strb w20, [x28, #704]
0x0000ffff9cc80228  d3471ef4    ubfx x20, x23, #7, #1
0x0000ffff9cc8022c  d3461af5    ubfx x21, x23, #6, #1
0x0000ffff9cc80230  ca150294    eor x20, x20, x21
0x0000ffff9cc80234  390b2f94    strb w20, [x28, #715]
0x0000ffff9cc80238  58000040    ldr x0, pc+8 (addr 0xffff9cc80240)
0x0000ffff9cc8023c  d63f0000    blr x0
0x0000ffff9cc80240  b2a75128    unallocated (Unallocated)
0x0000ffff9cc80244  0000ffff    udf #0xffff
0x0000ffff9cc80248  00010022    unallocated (Unallocated)
0x0000ffff9cc8024c  00000000    udf #0x0
[DEBUG] RIP: 0x10020
[DEBUG] Guest Code instructions: 1
[DEBUG] Host Code instructions: 29
[DEBUG] Blow-up Amt: 29x
```
  • Loading branch information
Sonicadvance1 committed Jul 18, 2023
1 parent 4a7fa7f commit ed75c19
Showing 1 changed file with 51 additions and 6 deletions.
57 changes: 51 additions & 6 deletions External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2669,17 +2669,62 @@ void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) {
// x86 masks the shift by 0x3F or 0x1F depending on size of op
Src = _And(Src, _Constant(Size, 0x1F));

OrderedNode *Tmp = _Constant(64, 0);
OrderedNode *Tmp{};

// Insert the incoming value across the temporary 64bit source
// Make sure to insert at <BitSize> + 1 offsets
// We need to cover 32bits plus the amount that could rotate in
for (size_t i = 0; i < (32 + Size + 1); i += (Size + 1)) {
// Insert incoming value
Tmp = _Bfi(8, Size, i, Tmp, Dest);

// Insert CF
Tmp = _Bfi(8, 1, i + Size, Tmp, CF);
if (Size == 8) {
// 8-bit optimal cascade
// Cascade: 0
// Data: -> [7:0]
// CF: -> [8:8]
// Cascade: 1
// Data: -> [16:9]
// CF: -> [17:17]
// Cascade: 2
// Data: -> [25:18]
// CF: -> [26:26]
// Cascade: 3
// Data: -> [34:27]
// CF: -> [35:35]
// Cascade: 4
// Data: -> [43:36]
// CF: -> [44:44]

// Insert CF, Destination already at [7:0]
Tmp = _Bfi(8, 1, 8, Dest, CF);

// First Cascade, copies 9 bits from itself.
Tmp = _Bfi(8, 9, 9, Tmp, Tmp);

// Second cascade, copies 18 bits from itself.
Tmp = _Bfi(8, 18, 18, Tmp, Tmp);

// Final cascade, copies 9 bits again from itself.
Tmp = _Bfi(8, 9, 36, Tmp, Tmp);
}
else {
// 16-bit optimal cascade
// Cascade: 0
// Data: -> [15:0]
// CF: -> [16:16]
// Cascade: 1
// Data: -> [32:17]
// CF: -> [33:33]
// Cascade: 2
// Data: -> [49:34]
// CF: -> [50:50]

// Insert CF, Destination already at [15:0]
Tmp = _Bfi(8, 1, 16, Dest, CF);

// First Cascade, copies 17 bits from itself.
Tmp = _Bfi(8, 17, 17, Tmp, Tmp);

// Final Cascade, copies 17 bits from itself again.
Tmp = _Bfi(8, 17, 34, Tmp, Tmp);
}

// Entire bitfield has been setup
Expand Down

0 comments on commit ed75c19

Please sign in to comment.