Skip to content

Commit

Permalink
Armjit: Combine mul.s + neg.s to VNMUL. Implement VNMUL, VNMLA, VNMLS.
Browse files Browse the repository at this point in the history
I had implemented mul.s + add/sub.s + add/sub.s -> VADD/VSUB + V(N)ML(A/S). Turns out it doesn't happen enough though (once or twice per game).
  • Loading branch information
xsacha committed Jun 8, 2013
1 parent c2dcebf commit f21218c
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 24 deletions.
55 changes: 32 additions & 23 deletions Common/ArmEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -906,24 +906,30 @@ struct VFPEnc
// Double/single, Neon
const VFPEnc VFPOps[][2] = {
{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
{{0xE0, 0xA4}, {0x22, 0xD1}}, // 1: VMLS
{{0xE3, 0xA0}, {0x20, 0xD0}}, // 2: VADD
{{0xE3, 0xA4}, {0x22, 0xD0}}, // 3: VSUB
{{0xE2, 0xA0}, {0x30, 0xD1}}, // 4: VMUL
{{0xEB, 0xAC}, { -1 /* 0x3B */, -1 /* 0x70 */}}, // 5: VABS(Vn(0x0) used for encoding)
{{0xE8, 0xA0}, { -1, -1}}, // 6: VDIV
{{0xEB, 0xA4}, { -1 /* 0x3B */, -1 /* 0x78 */}}, // 7: VNEG(Vn(0x1) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 8: VSQRT (Vn(0x1) used for encoding)
{{0xEB, 0xA4}, { -1, -1}}, // 9: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 10: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{ -1, -1}, {0x3B, 0x30}}, // 11: VABSi
{{0xE1, 0xA4}, { -1, -1}}, // 1: VNMLA
{{0xE0, 0xA4}, {0x22, 0xD1}}, // 2: VMLS
{{0xE1, 0xA0}, { -1, -1}}, // 3: VNMLS
{{0xE3, 0xA0}, {0x20, 0xD0}}, // 4: VADD
{{0xE3, 0xA4}, {0x22, 0xD0}}, // 5: VSUB
{{0xE2, 0xA0}, {0x30, 0xD1}}, // 6: VMUL
{{0xE2, 0xA4}, { -1, -1}}, // 7: VNMUL
{{0xEB, 0xAC}, { -1 /* 0x3B */, -1 /* 0x70 */}}, // 8: VABS(Vn(0x0) used for encoding)
{{0xE8, 0xA0}, { -1, -1}}, // 9: VDIV
{{0xEB, 0xA4}, { -1 /* 0x3B */, -1 /* 0x78 */}}, // 10: VNEG(Vn(0x1) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 11: VSQRT (Vn(0x1) used for encoding)
{{0xEB, 0xA4}, { -1, -1}}, // 12: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{0xEB, 0xAC}, { -1, -1}}, // 13: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
{{ -1, -1}, {0x3B, 0x30}}, // 14: VABSi
};
const char *VFPOpNames[] = {
"VMLA",
"VNMLA",
"VMLS",
"VNMLS",
"VADD",
"VSUB",
"VMUL",
"VNMUL",
"VABS",
"VDIV",
"VNEG",
Expand Down Expand Up @@ -993,18 +999,21 @@ void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
Write32(cond | (enc.opc1 << 20) | VnEnc | VdEnc | (enc.opc2 << 4) | (quad_reg << 6) | (double_reg << 8) | VmEnc);
}
void ARMXEmitter::VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(0, Vd, Vn, Vm); }
void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(5, Vd, D0, Vm); }
void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(7, Vd, D1, Vm); }
void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D1, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(9, Vd, D4, Vm); }
void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D4, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(9, Vd, D5, D0); }
void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(10, Vd, D5, D0); }
void ARMXEmitter::VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
void ARMXEmitter::VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(5, Vd, Vn, Vm); }
void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
void ARMXEmitter::VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(7, Vd, Vn, Vm); }
void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D0, Vm); }
void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(9, Vd, Vn, Vm); }
void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D1, Vm); }
void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(11, Vd, D1, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(12, Vd, D4, Vm); }
void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(13, Vd, D4, Vm); }
void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(12, Vd, D5, D0); }
void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(13, Vd, D5, D0); }

void ARMXEmitter::VLDR(ARMReg Dest, ARMReg Base, s16 offset)
{
Expand Down
4 changes: 4 additions & 0 deletions Common/ArmEmitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,10 @@ class ARMXEmitter
// Compares against zero
void VCMP(ARMReg Vd);
void VCMPE(ARMReg Vd);

void VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VSQRT(ARMReg Vd, ARMReg Vm);

Expand Down
18 changes: 17 additions & 1 deletion Core/MIPS/ARM/ArmCompFPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Core/Config.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"

#include "ArmJit.h"
#include "ArmRegCache.h"
Expand Down Expand Up @@ -52,7 +53,22 @@ void Jit::Comp_FPU3op(u32 op)
{
case 0: VADD(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) + F(ft); //add
case 1: VSUB(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) - F(ft); //sub
case 2: VMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) * F(ft); //mul
case 2: { //F(fd) = F(fs) * F(ft); //mul
u32 nextOp = Memory::Read_Instruction(js.compilerPC + 4);
// Optimise possible if destination is the same
if (fd == ((nextOp>>6) & 0x1F)) {
// VMUL + VNEG -> VNMUL
if (!strcmp(MIPSGetName(nextOp), "neg.s")) {
if (fd == ((nextOp>>11) & 0x1F)) {
VNMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft));
EatInstruction(nextOp);
}
return;
}
}
VMUL(fpr.R(fd), fpr.R(fs), fpr.R(ft));
break;
}
case 3: VDIV(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) / F(ft); //div
default:
DISABLE;
Expand Down

2 comments on commit f21218c

@hrydgard
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh, yeah, if it only was once or twice per frame, the optimization probably wasn't worth your time. Good to have those ops added to the emitter though.

@xsacha
Copy link
Collaborator Author

@xsacha xsacha commented on f21218c Jun 8, 2013

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In certain games, mul.s + add.s + sub.s is used once in a loop to calculate the position of an object (so once per frame).
Unfortunately most of the uses were with mul.s s0, s0, s1. The overlap means I can't use it :(

Here's the code for mul.s + add/sub.s + add/sub.s in case anyone can make it more useful: xsacha@f433f40#L2R74

Please sign in to comment.