From d399c4a4708b4022ae159388098c09d693c01968 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 6 May 2016 23:45:37 +0200 Subject: [PATCH 01/77] Initial commit for IRJit --- Core/Config.cpp | 1 + Core/Config.h | 1 + Core/Core.vcxproj | 12 + Core/Core.vcxproj.filters | 39 +++ Core/CoreParameter.h | 1 + Core/MIPS/IR/IRAsm.cpp | 28 ++ Core/MIPS/IR/IRCompALU.cpp | 420 ++++++++++++++++++++++++++++++ Core/MIPS/IR/IRCompBranch.cpp | 363 ++++++++++++++++++++++++++ Core/MIPS/IR/IRCompFPU.cpp | 226 ++++++++++++++++ Core/MIPS/IR/IRCompLoadStore.cpp | 162 ++++++++++++ Core/MIPS/IR/IRCompVFPU.cpp | 326 +++++++++++++++++++++++ Core/MIPS/IR/IRInst.cpp | 316 ++++++++++++++++++++++ Core/MIPS/IR/IRInst.h | 260 ++++++++++++++++++ Core/MIPS/IR/IRJit.cpp | 333 +++++++++++++++++++++++ Core/MIPS/IR/IRJit.h | 276 ++++++++++++++++++++ Core/MIPS/IR/IRRegCache.cpp | 46 ++++ Core/MIPS/IR/IRRegCache.h | 43 +++ Core/MIPS/JitCommon/JitCommon.cpp | 16 +- Core/MIPS/MIPS.h | 6 +- 19 files changed, 2869 insertions(+), 6 deletions(-) create mode 100644 Core/MIPS/IR/IRAsm.cpp create mode 100644 Core/MIPS/IR/IRCompALU.cpp create mode 100644 Core/MIPS/IR/IRCompBranch.cpp create mode 100644 Core/MIPS/IR/IRCompFPU.cpp create mode 100644 Core/MIPS/IR/IRCompLoadStore.cpp create mode 100644 Core/MIPS/IR/IRCompVFPU.cpp create mode 100644 Core/MIPS/IR/IRInst.cpp create mode 100644 Core/MIPS/IR/IRInst.h create mode 100644 Core/MIPS/IR/IRJit.cpp create mode 100644 Core/MIPS/IR/IRJit.h create mode 100644 Core/MIPS/IR/IRRegCache.cpp create mode 100644 Core/MIPS/IR/IRRegCache.h diff --git a/Core/Config.cpp b/Core/Config.cpp index a5b4bb79d557..6faa97a0759b 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -354,6 +354,7 @@ static bool DefaultSasThread() { static ConfigSetting cpuSettings[] = { ReportedConfigSetting("Jit", &g_Config.bJit, &DefaultJit, true, true), + ReportedConfigSetting("CPUCore", &g_Config.bJit, &DefaultJit, true, true), ReportedConfigSetting("SeparateCPUThread", &g_Config.bSeparateCPUThread, false, true, true), ReportedConfigSetting("SeparateSASThread", &g_Config.bSeparateSASThread, &DefaultSasThread, true, true), ReportedConfigSetting("SeparateIOThread", &g_Config.bSeparateIOThread, true, true, true), diff --git a/Core/Config.h b/Core/Config.h index b2cfc1351e45..825091bd0cc6 100644 --- a/Core/Config.h +++ b/Core/Config.h @@ -120,6 +120,7 @@ struct Config { bool bIgnoreBadMemAccess; bool bFastMemory; bool bJit; + int iCpuCore; bool bCheckForNewVersion; bool bForceLagSync; bool bFuncReplacements; diff --git a/Core/Core.vcxproj b/Core/Core.vcxproj index 83552556d0e1..af4b459fb1b9 100644 --- a/Core/Core.vcxproj +++ b/Core/Core.vcxproj @@ -181,6 +181,15 @@ + + + + + + + + + @@ -507,6 +516,9 @@ + + + diff --git a/Core/Core.vcxproj.filters b/Core/Core.vcxproj.filters index d4ba376f94db..99af2a2696fe 100644 --- a/Core/Core.vcxproj.filters +++ b/Core/Core.vcxproj.filters @@ -67,6 +67,9 @@ {67687dba-8313-4442-b4eb-4be8c4867b65} + + {119ac973-e457-4025-9e1e-4fb34022ae23} + @@ -634,6 +637,33 @@ Core + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + @@ -1179,6 +1209,15 @@ Core + + MIPS\IR + + + MIPS\IR + + + MIPS\IR + diff --git a/Core/CoreParameter.h b/Core/CoreParameter.h index ac33c91f5827..1517b50c03c1 100644 --- a/Core/CoreParameter.h +++ b/Core/CoreParameter.h @@ -24,6 +24,7 @@ enum CPUCore { CPU_INTERPRETER, CPU_JIT, + CPU_IRJIT, }; enum GPUCore { diff --git a/Core/MIPS/IR/IRAsm.cpp b/Core/MIPS/IR/IRAsm.cpp new file mode 100644 index 
000000000000..f5d9c7ad3157 --- /dev/null +++ b/Core/MIPS/IR/IRAsm.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2015- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "base/logging.h" + +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/System.h" +#include "Core/CoreTiming.h" +#include "Common/MemoryUtil.h" +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/JitCommon/JitCommon.h" + + diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp new file mode 100644 index 000000000000..67059e371e5e --- /dev/null +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -0,0 +1,420 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include + +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Common/CPUDetect.h" + +using namespace MIPSAnalyst; + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. 
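// Editor's illustration (not part of this patch): how the field macros above slice a raw
// MIPS opcode word, assuming the standard layout op(31:26) rs(25:21) rt(20:16) imm(15:0).
// "addiu $t0, $sp, 16" assembles to 0x27A80010, so Comp_IType below would see:
static_assert((0x27A80010u >> 26) == 9, "opcode 9 -> addiu, Comp_IType case 9");
static_assert(((0x27A80010u >> 21) & 0x1F) == 29, "_RS -> $sp (reg 29)");
static_assert(((0x27A80010u >> 16) & 0x1F) == 8, "_RT -> $t0 (reg 8)");
static_assert((short)(0x27A80010u & 0xFFFF) == 16, "_IMM16 -> +16, lowered to IROp::AddConst");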
+ +// #define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +namespace MIPSComp { + +void IRJit::CompImmLogic(MIPSGPReg rs, MIPSGPReg rt, u32 uimm, IROp OP) { + if (gpr.IsImm(rs)) { + switch (OP) { + case IROp::AddConst: gpr.SetImm(rt, rs + uimm); break; + case IROp::SubConst: gpr.SetImm(rt, rs - uimm); break; + case IROp::AndConst: gpr.SetImm(rt, rs & uimm); break; + case IROp::OrConst: gpr.SetImm(rt, rs | uimm); break; + case IROp::XorConst: gpr.SetImm(rt, rs ^ uimm); break; + } + } else { + gpr.MapDirtyIn(rt, rs); + ir.Write(OP, rt, ir.AddConstant(uimm)); + } +} + +void IRJit::Comp_IType(MIPSOpcode op) { + CONDITIONAL_DISABLE; + s32 simm = (s32)(s16)(op & 0xFFFF); // sign extension + u32 uimm = op & 0xFFFF; + u32 suimm = (u32)(s32)simm; + + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + + // noop, won't write to ZERO. + if (rt == 0) + return; + + switch (op >> 26) { + case 8: // same as addiu? + case 9: // R(rt) = R(rs) + simm; break; //addiu + // Special-case for small adjustments of pointerified registers. Commonly for SP but happens for others. + if (simm >= 0) { + CompImmLogic(rs, rt, simm, IROp::AddConst); + } else if (simm < 0) { + CompImmLogic(rs, rt, -simm, IROp::SubConst); + } + break; + + case 12: CompImmLogic(rs, rt, uimm, IROp::AndConst); break; + case 13: CompImmLogic(rs, rt, uimm, IROp::OrConst); break; + case 14: CompImmLogic(rs, rt, uimm, IROp::XorConst); break; + + case 10: // R(rt) = (s32)R(rs) < simm; break; //slti + if (gpr.IsImm(rs)) { + gpr.SetImm(rt, (s32)gpr.GetImm(rs) < simm ? 1 : 0); + break; + } + gpr.MapDirtyIn(rt, rs); + // Grab the sign bit (< 0) as 1/0. Slightly faster than a shift. + ir.Write(IROp::Slt, rt, rs, ir.AddConstant(simm)); + break; + + case 11: // R(rt) = R(rs) < suimm; break; //sltiu + if (gpr.IsImm(rs)) { + gpr.SetImm(rt, gpr.GetImm(rs) < suimm ? 1 : 0); + break; + } + gpr.MapDirtyIn(rt, rs); + ir.Write(IROp::SltU, rt, rs, ir.AddConstant(suimm)); + break; + + case 15: // R(rt) = uimm << 16; //lui + gpr.SetImm(rt, uimm << 16); + break; + + default: + Comp_Generic(op); + break; + } +} + +void IRJit::Comp_RType2(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + + // Don't change $zr. + if (rd == 0) + return; + + switch (op & 63) { + case 22: //clz + gpr.MapDirtyIn(rd, rs); + ir.Write(IROp::Clz, rd, rs); + break; + case 23: //clo + gpr.MapDirtyIn(rd, rs); + ir.Write(IROp::Not, IRTEMP_0, rs); + ir.Write(IROp::Clz, rd, IRTEMP_0); + break; + default: + DISABLE; + } +} + +void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp constOp, bool symmetric) { + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + switch (op) { + case IROp::Add: gpr.SetImm(rd, gpr.GetImm(rs) + gpr.GetImm(rt)); break; + case IROp::Sub: gpr.SetImm(rd, gpr.GetImm(rs) - gpr.GetImm(rt)); break; + case IROp::And: gpr.SetImm(rd, gpr.GetImm(rs) & gpr.GetImm(rt)); break; + case IROp::Or: gpr.SetImm(rd, gpr.GetImm(rs) | gpr.GetImm(rt)); break; + case IROp::Xor: gpr.SetImm(rd, gpr.GetImm(rs) ^ gpr.GetImm(rt)); break; + } + return; + } + + if (gpr.IsImm(rt) || (gpr.IsImm(rs) && symmetric)) { + MIPSGPReg lhs = gpr.IsImm(rs) ? rt : rs; + MIPSGPReg rhs = gpr.IsImm(rs) ? rs : rt; + u32 rhsImm = gpr.GetImm(rhs); + gpr.MapDirtyIn(rd, lhs); + ir.Write(constOp, rd, lhs, ir.AddConstant(rhsImm)); + // If rd is rhs, we may have lost it in the MapDirtyIn(). lhs was kept. + // This means the rhsImm value was never flushed to rhs, and would be garbage. 
+ if (rd == rhs) { + // Luckily, it was just an imm. + gpr.SetImm(rhs, rhsImm); + } + } + + // Can't do the RSB optimization on ARM64 - no RSB! + + // Generic solution. If it's an imm, better to flush at this point. + gpr.MapDirtyInIn(rd, rs, rt); + ir.Write(op, rd, rs, rt); +} + +void IRJit::Comp_RType3(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + + // noop, won't write to ZERO. + if (rd == 0) + return; + + switch (op & 63) { + case 10: //if (!R(rt)) R(rd) = R(rs); break; //movz + gpr.MapDirtyInIn(rd, rt, rs); + ir.Write(IROp::MovZ, rd, rt, rs); + break; + case 11:// if (R(rt)) R(rd) = R(rs); break; //movn + gpr.MapDirtyInIn(rd, rt, rs); + ir.Write(IROp::MovNZ, rd, rt, rs); + break; + + case 32: //R(rd) = R(rs) + R(rt); break; //add + case 33: //R(rd) = R(rs) + R(rt); break; //addu + CompType3(rd, rs, rt, IROp::Add, IROp::AddConst, true); + break; + + case 34: //R(rd) = R(rs) - R(rt); break; //sub + case 35: //R(rd) = R(rs) - R(rt); break; //subu + CompType3(rd, rs, rt, IROp::Sub, IROp::SubConst, false); + break; + + case 36: //R(rd) = R(rs) & R(rt); break; //and + CompType3(rd, rs, rt, IROp::And, IROp::AndConst, true); + break; + case 37: //R(rd) = R(rs) | R(rt); break; //or + CompType3(rd, rs, rt, IROp::Or, IROp::OrConst, true); + break; + case 38: //R(rd) = R(rs) ^ R(rt); break; //xor/eor + CompType3(rd, rs, rt, IROp::Xor, IROp::XorConst, true); + break; + + case 39: // R(rd) = ~(R(rs) | R(rt)); break; //nor + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, ~(gpr.GetImm(rs) | gpr.GetImm(rt))); + } + + ir.Write(IROp::Or, IRTEMP_0, rs, rt); + ir.Write(IROp::Not, rd, IRTEMP_0); + break; + + case 42: //R(rd) = (int)R(rs) < (int)R(rt); break; //slt + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, (s32)gpr.GetImm(rs) < (s32)gpr.GetImm(rt)); + } else { + gpr.MapDirtyInIn(rd, rt, rs); + ir.Write(IROp::Slt, rd, rs, rt); + } + break; + + case 43: //R(rd) = R(rs) < R(rt); break; //sltu + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, gpr.GetImm(rs) < gpr.GetImm(rt)); + } else { + gpr.MapDirtyInIn(rd, rt, rs); + ir.Write(IROp::SltU, rd, rs, rt); + } + break; + + case 44: //R(rd) = max(R(rs), R(rt); break; //max + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, std::max(gpr.GetImm(rs), gpr.GetImm(rt))); + break; + } + gpr.MapDirtyInIn(rd, rs, rt); + ir.Write(IROp::Max, rd, rs, rt); + break; + + case 45: //R(rd) = min(R(rs), R(rt)); break; //min + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, std::min(gpr.GetImm(rs), gpr.GetImm(rt))); + break; + } + gpr.MapDirtyInIn(rd, rs, rt); + ir.Write(IROp::Min, rd, rs, rt); + break; + + default: + Comp_Generic(op); + break; + } +} + +void IRJit::CompShiftImm(MIPSOpcode op, IROp shiftOpConst, int sa) { + MIPSGPReg rd = _RD; + MIPSGPReg rt = _RT; + if (gpr.IsImm(rt)) { + switch (shiftOpConst) { + case IROp::ShlImm: + gpr.SetImm(rd, gpr.GetImm(rt) << sa); + break; + case IROp::ShrImm: + gpr.SetImm(rd, gpr.GetImm(rt) >> sa); + break; + case IROp::SarImm: + gpr.SetImm(rd, (int)gpr.GetImm(rt) >> sa); + break; + case IROp::RorImm: + gpr.SetImm(rd, (gpr.GetImm(rt) >> sa) | (gpr.GetImm(rt) << (32 - sa))); + break; + default: + DISABLE; + } + } else { + gpr.MapDirtyIn(rd, rt); + ir.Write(shiftOpConst, rd, rt, sa); + } +} + +void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpConst) { + MIPSGPReg rd = _RD; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + if (gpr.IsImm(rs)) { + int sa = gpr.GetImm(rs) & 0x1F; + CompShiftImm(op, shiftOpConst, sa); + return; + 
} + gpr.MapDirtyInIn(rd, rs, rt); + // Not sure if ARM64 wraps like this so let's do it for it. (TODO: According to the ARM ARM, it will indeed mask for us so this is not necessary) + // ANDI2R(SCRATCH1, gpr.R(rs), 0x1F, INVALID_REG); + ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31)); + ir.Write(shiftOp, rd, rt, IRTEMP_0); +} + +void IRJit::Comp_ShiftType(MIPSOpcode op) { + CONDITIONAL_DISABLE; + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + int fd = _FD; + int sa = _SA; + + // noop, won't write to ZERO. + if (rd == 0) + return; + + // WARNING : ROTR + switch (op & 0x3f) { + case 0: CompShiftImm(op, IROp::Shl, sa); break; //sll + case 2: CompShiftImm(op, rs == 1 ? IROp::Ror : IROp::Shr, sa); break; //srl + case 3: CompShiftImm(op, IROp::Sar, sa); break; //sra + case 4: CompShiftVar(op, IROp::Shl, IROp::ShlImm); break; //sllv + case 6: CompShiftVar(op, (fd == 1 ? IROp::Ror : IROp::Shr), (fd == 1 ? IROp::RorImm : IROp::ShrImm)); break; //srlv + case 7: CompShiftVar(op, IROp::Sar, IROp::SarImm); break; //srav + default: + DISABLE; + break; + } +} + +void IRJit::Comp_Special3(MIPSOpcode op) { + DISABLE; +} + +void IRJit::Comp_Allegrex(MIPSOpcode op) { + CONDITIONAL_DISABLE; + MIPSGPReg rt = _RT; + MIPSGPReg rd = _RD; + // Don't change $zr. + if (rd == 0) + return; + + switch ((op >> 6) & 31) { + case 16: // seb // R(rd) = (u32)(s32)(s8)(u8)R(rt); + if (gpr.IsImm(rt)) { + gpr.SetImm(rd, (s32)(s8)(u8)gpr.GetImm(rt)); + return; + } + gpr.MapDirtyIn(rd, rt); + ir.Write(IROp::Ext8to32, rd, rt); + break; + + case 24: // seh + if (gpr.IsImm(rt)) { + gpr.SetImm(rd, (s32)(s16)(u16)gpr.GetImm(rt)); + return; + } + gpr.MapDirtyIn(rd, rt); + ir.Write(IROp::Ext16to32, rd, rt); + break; + + case 20: //bitrev + default: + Comp_Generic(op); + return; + } +} + +void IRJit::Comp_Allegrex2(MIPSOpcode op) { + CONDITIONAL_DISABLE; + MIPSGPReg rt = _RT; + MIPSGPReg rd = _RD; + // Don't change $zr. + if (rd == 0) + return; + + switch (op & 0x3ff) { + case 0xA0: //wsbh + if (gpr.IsImm(rt)) { + gpr.SetImm(rd, ((gpr.GetImm(rt) & 0xFF00FF00) >> 8) | ((gpr.GetImm(rt) & 0x00FF00FF) << 8)); + } else { + gpr.MapDirtyIn(rd, rt); + ir.Write(IROp::BSwap16, rd, rt); + } + break; + case 0xE0: //wsbw + if (gpr.IsImm(rt)) { + gpr.SetImm(rd, swap32(gpr.GetImm(rt))); + } else { + gpr.MapDirtyIn(rd, rt); + ir.Write(IROp::BSwap16, rd, rt); + } + break; + default: + Comp_Generic(op); + break; + } +} + +void IRJit::Comp_MulDivType(MIPSOpcode op) { + CONDITIONAL_DISABLE; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + + // Note that in all cases below, LO is actually mapped to HI:LO. + // That is, the host reg is 64 bits and has HI at the top. + // HI is not mappable. + + DISABLE; +} + +} diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp new file mode 100644 index 000000000000..16c7245b82dc --- /dev/null +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -0,0 +1,363 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "profiler/profiler.h" + +#include "Core/Reporting.h" +#include "Core/Config.h" +#include "Core/MemMap.h" +#include "Core/HLE/HLE.h" +#include "Core/HLE/HLETables.h" + +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSAnalyst.h" +#include "Core/MIPS/MIPSTables.h" + +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Core/MIPS/JitCommon/JitBlockCache.h" + +#include "Common/Arm64Emitter.h" + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +#define LOOPOPTIMIZATION 0 + +using namespace MIPSAnalyst; + +namespace MIPSComp +{ + using namespace Arm64Gen; + +void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) +{ + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in RSRTComp delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + int offset = _IMM16 << 2; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + u32 targetAddr = GetCompilerPC() + offset + 4; + + MIPSOpcode delaySlotOp = GetOffsetInstruction(1); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs); + + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + + MIPSGPReg lhs = rs; + MIPSGPReg rhs = rt; + if (!delaySlotIsNice) { + ir.Write(IROp::Mov, IRTEMP_0, rs); + ir.Write(IROp::Mov, IRTEMP_1, rt); + lhs = (MIPSGPReg)IRTEMP_0; + rhs = (MIPSGPReg)IRTEMP_1; + } + + if (!likely) + CompileDelaySlot(); + + gpr.MapInIn(lhs, rhs); + ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs, rhs); + // This makes the block "impure" :( + if (likely) + CompileDelaySlot(); + + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + + js.compiling = false; +} + +void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in RSZeroComp delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + int offset = _IMM16 << 2; + MIPSGPReg rs = _RS; + u32 targetAddr = GetCompilerPC() + offset + 4; + + MIPSOpcode delaySlotOp = GetOffsetInstruction(1); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); + + if (!likely && delaySlotIsNice) + CompileDelaySlot(); + int lhs = rs; + gpr.MapIn(rs); + if (!delaySlotIsNice) { + ir.Write(IROp::Mov, IRTEMP_0, rs); + lhs = IRTEMP_0; + } + ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs); + if (likely) { + CompileDelaySlot(); + } + // Taken + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + js.compiling = false; +} + +void IRJit::Comp_RelBranch(MIPSOpcode op) { + // The CC flags here should be opposite of the actual branch becuase they skip the branching action. 
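	// Editor's sketch (not from the patch): for instance, "beq rs, rt, target" takes the
	// IRComparison::NotEqual path below, so BranchRSRTComp() above roughly emits:
	//   Downcount         <block downcount>
	//   <delay slot>
	//   ExitToConstIfNeq  const(PC + 8), rs, rt   ; not taken -> continue after the slot
	//   ExitToConst       const(target)           ; taken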
+ switch (op >> 26) { + case 4: BranchRSRTComp(op, IRComparison::NotEqual, false); break;//beq + case 5: BranchRSRTComp(op, IRComparison::Equal, false); break;//bne + + case 6: BranchRSZeroComp(op, IRComparison::Greater, false, false); break;//blez + case 7: BranchRSZeroComp(op, IRComparison::LessEqual, false, false); break;//bgtz + + case 20: BranchRSRTComp(op, IRComparison::NotEqual, true); break;//beql + case 21: BranchRSRTComp(op, IRComparison::Equal, true); break;//bnel + + case 22: BranchRSZeroComp(op, IRComparison::Greater, false, true); break;//blezl + case 23: BranchRSZeroComp(op, IRComparison::LessEqual, false, true); break;//bgtzl + + default: + _dbg_assert_msg_(CPU,0,"Trying to compile instruction that can't be compiled"); + break; + } +} + +void IRJit::Comp_RelBranchRI(MIPSOpcode op) { + switch ((op >> 16) & 0x1F) { + case 0: BranchRSZeroComp(op, IRComparison::GreaterEqual, false, false); break; //if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 4; break;//bltz + case 1: BranchRSZeroComp(op, IRComparison::Less, false, false); break; //if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 4; break;//bgez + case 2: BranchRSZeroComp(op, IRComparison::GreaterEqual, false, true); break; //if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 8; break;//bltzl + case 3: BranchRSZeroComp(op, IRComparison::Less, false, true); break; //if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 8; break;//bgezl + case 16: BranchRSZeroComp(op, IRComparison::GreaterEqual, true, false); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 4; break;//bltzal + case 17: BranchRSZeroComp(op, IRComparison::Less, true, false); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 4; break;//bgezal + case 18: BranchRSZeroComp(op, IRComparison::GreaterEqual, true, true); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) < 0) DelayBranchTo(addr); else SkipLikely(); break;//bltzall + case 19: BranchRSZeroComp(op, IRComparison::Less, true, true); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) >= 0) DelayBranchTo(addr); else SkipLikely(); break;//bgezall + default: + _dbg_assert_msg_(CPU,0,"Trying to compile instruction that can't be compiled"); + break; + } +} + +// If likely is set, discard the branch slot if NOT taken. +void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in FPFlag delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + int offset = _IMM16 << 2; + u32 targetAddr = GetCompilerPC() + offset + 4; + + MIPSOpcode delaySlotOp = GetOffsetInstruction(1); + ir.Write(IROp::FpCondToReg, IRTEMP_0); + if (!likely) + CompileDelaySlot(); + + FlushAll(); + // Not taken + ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), IRTEMP_0, 0); + // Taken + if (likely) + CompileDelaySlot(); + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + js.compiling = false; +} + +void IRJit::Comp_FPUBranch(MIPSOpcode op) { + switch((op >> 16) & 0x1f) { + case 0: BranchFPFlag(op, IRComparison::NotEqual, false); break; // bc1f + case 1: BranchFPFlag(op, IRComparison::Equal, false); break; // bc1t + case 2: BranchFPFlag(op, IRComparison::NotEqual, true); break; // bc1fl + case 3: BranchFPFlag(op, IRComparison::Equal, true); break; // bc1tl + default: + _dbg_assert_msg_(CPU, 0, "Trying to interpret instruction that can't be interpreted"); + break; + } +} + +// If likely is set, discard the branch slot if NOT taken. 
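// "Likely" variants (beql, bnel, bc1fl, ...) execute the delay slot only when the branch
// is taken, which is why the handlers here compile the slot after the not-taken exit when
// 'likely' is set, and before it otherwise.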
+void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in VFPU delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + int offset = _IMM16 << 2; + u32 targetAddr = GetCompilerPC() + offset + 4; + + MIPSOpcode delaySlotOp = GetOffsetInstruction(1); + + ir.Write(IROp::VfpCondToReg, IRTEMP_0); + + // Sometimes there's a VFPU branch in a delay slot (Disgaea 2: Dark Hero Days, Zettai Hero Project, La Pucelle) + // The behavior is undefined - the CPU may take the second branch even if the first one passes. + // However, it does consistently try each branch, which these games seem to expect. + bool delaySlotIsBranch = MIPSCodeUtils::IsVFPUBranch(delaySlotOp); + if (!likely) + CompileDelaySlot(); + + if (delaySlotIsBranch && (signed short)(delaySlotOp & 0xFFFF) != (signed short)(op & 0xFFFF) - 1) + ERROR_LOG_REPORT(JIT, "VFPU branch in VFPU delay slot at %08x with different target", GetCompilerPC()); + + int imm3 = (op >> 18) & 7; + + u32 notTakenTarget = GetCompilerPC() + (delaySlotIsBranch ? 4 : 8); + + ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(imm3)); + ir.Write(ComparisonToExit(cc), ir.AddConstant(notTakenTarget), IRTEMP_0, 0); + + if (likely) + CompileDelaySlot(); + + // Taken + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + js.compiling = false; +} + +void IRJit::Comp_VBranch(MIPSOpcode op) { + switch ((op >> 16) & 3) { + case 0: BranchVFPUFlag(op, IRComparison::NotEqual, false); break; // bvf + case 1: BranchVFPUFlag(op, IRComparison::Equal, false); break; // bvt + case 2: BranchVFPUFlag(op, IRComparison::NotEqual, true); break; // bvfl + case 3: BranchVFPUFlag(op, IRComparison::Equal, true); break; // bvtl + } +} + +void IRJit::Comp_Jump(MIPSOpcode op) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in Jump delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + + u32 off = _IMM26 << 2; + u32 targetAddr = (GetCompilerPC() & 0xF0000000) | off; + + // Might be a stubbed address or something? + if (!Memory::IsValidAddress(targetAddr)) { + if (js.nextExit == 0) { + ERROR_LOG_REPORT(JIT, "Jump to invalid address: %08x", targetAddr); + } else { + js.compiling = false; + } + // TODO: Mark this block dirty or something? May be indication it will be changed by imports. 
+ return; + } + + switch (op >> 26) { + case 2: //j + CompileDelaySlot(); + FlushAll(); + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + break; + + case 3: //jal + if (ReplaceJalTo(targetAddr)) + return; + gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); + CompileDelaySlot(); + FlushAll(); + ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); + break; + + default: + _dbg_assert_msg_(CPU,0,"Trying to compile instruction that can't be compiled"); + break; + } + js.compiling = false; +} + +void IRJit::Comp_JumpReg(MIPSOpcode op) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in JumpReg delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); + return; + } + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + bool andLink = (op & 0x3f) == 9 && rd != MIPS_REG_ZERO; + + MIPSOpcode delaySlotOp = GetOffsetInstruction(1); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); + if (andLink && rs == rd) + delaySlotIsNice = false; + + int destReg; + if (IsSyscall(delaySlotOp)) { + gpr.MapDirty(rs); + ir.Write(IROp::SetPC, 0, rs); + if (andLink) + gpr.SetImm(rd, GetCompilerPC() + 8); + CompileDelaySlot(); + return; // Syscall (delay slot) wrote exit code. + } else if (delaySlotIsNice) { + if (andLink) + gpr.SetImm(rd, GetCompilerPC() + 8); + CompileDelaySlot(); + gpr.MapDirty(rs); + destReg = rs; // Safe because FlushAll doesn't change any regs + FlushAll(); + } else { + // Bad delay slot. + gpr.MapDirty(rs); + ir.Write(IROp::Mov, IRTEMP_0, rs); + destReg = IRTEMP_0; + if (andLink) + gpr.SetImm(rd, GetCompilerPC() + 8); + CompileDelaySlot(); + FlushAll(); + } + + switch (op & 0x3f) + { + case 8: //jr + break; + case 9: //jalr + break; + default: + _dbg_assert_msg_(CPU,0,"Trying to compile instruction that can't be compiled"); + break; + } + + ir.Write(IROp::ExitToReg, ir.AddConstant(js.downcountAmount), rs, 0); + js.compiling = false; +} + +void IRJit::Comp_Syscall(MIPSOpcode op) { + // If we're in a delay slot, this is off by one. + const int offset = js.inDelaySlot ? -1 : 0; + RestoreRoundingMode(); + js.downcountAmount = -offset; + + FlushAll(); + + ir.Write(IROp::Syscall, 0, ir.AddConstant(op.encoding)); + + ApplyRoundingMode(); + js.compiling = false; +} + +void IRJit::Comp_Break(MIPSOpcode op) +{ + Comp_Generic(op); + js.compiling = false; +} + +} // namespace Mipscomp diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp new file mode 100644 index 000000000000..00a8ec63991c --- /dev/null +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -0,0 +1,226 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
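// This file lowers the COP1 (FPU) side of the instruction set: three-operand arithmetic
// (Comp_FPU3op), compares that feed the FP condition bit (Comp_FPUComp), conversions and
// rounding (Comp_FPU2op), and GPR<->FPR transfers (Comp_mxc1). FPU loads and stores
// (Comp_FPULS) still fall back to the interpreter.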
+ +#include "Core/Config.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSTables.h" + +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Common/CPUDetect.h" + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + + +// FPCR interesting bits: +// 24: FZ (flush-to-zero) +// 23:22: RMode (0 = nearest, 1 = +inf, 2 = -inf, 3 = zero) +// not much else is interesting for us, but should be preserved. +// To access: MRS Xt, FPCR ; MSR FPCR, Xt + + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. + +// #define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +namespace MIPSComp { + +void IRJit::Comp_FPU3op(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int ft = _FT; + int fs = _FS; + int fd = _FD; + + switch (op & 0x3f) { + case 0: ir.Write(IROp::FAdd, fd, fs, ft); break; //F(fd) = F(fs) + F(ft); //add + case 1: ir.Write(IROp::FSub, fd, fs, ft); break; //F(fd) = F(fs) - F(ft); //sub + case 2: ir.Write(IROp::FMul, fd, fs, ft); break; //F(fd) = F(fs) * F(ft); //mul + case 3: ir.Write(IROp::FDiv, fd, fs, ft); break; //F(fd) = F(fs) / F(ft); //div + default: + DISABLE; + return; + } +} + +void IRJit::Comp_FPULS(MIPSOpcode op) { + DISABLE; +} + +void IRJit::Comp_FPUComp(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int opc = op & 0xF; + if (opc >= 8) opc -= 8; // alias + if (opc == 0) { // f, sf (signalling false) + gpr.SetImm(MIPS_REG_FPCOND, 0); + return; + } + + int fs = _FS; + int ft = _FT; + + IROp irOp; + switch (opc) { + case 1: // un, ngle (unordered) + irOp = IROp::FCmpUnordered; + break; + case 2: // eq, seq (equal, ordered) + irOp = IROp::FCmpEqual; + break; + case 3: // ueq, ngl (equal, unordered) + irOp = IROp::FCmpEqualUnordered; + return; + case 4: // olt, lt (less than, ordered) + irOp = IROp::FCmpLessOrdered; + break; + case 5: // ult, nge (less than, unordered) + irOp = IROp::FCmpLessUnordered; + break; + case 6: // ole, le (less equal, ordered) + irOp = IROp::FCmpLessEqualOrdered; + break; + case 7: // ule, ngt (less equal, unordered) + irOp = IROp::FCmpLessEqualUnordered; + break; + default: + Comp_Generic(op); + return; + } + ir.Write(irOp, fs, ft); +} + +void IRJit::Comp_FPU2op(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int fs = _FS; + int fd = _FD; + + switch (op & 0x3f) { + case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt + ir.Write(IROp::FSqrt, fd, fs); + break; + case 5: //F(fd) = fabsf(F(fs)); break; //abs + ir.Write(IROp::FAbs, fd, fs); + break; + case 6: //F(fd) = F(fs); break; //mov + ir.Write(IROp::FMov, fd, fs); + break; + case 7: //F(fd) = -F(fs); break; //neg + ir.Write(IROp::FNeg, fd, fs); + break; + + case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s + { + ir.Write(IROp::FRound, fd, fs); + break; + } + + case 13: //FsI(fd) = Rto0(F(fs))); break; //trunc.w.s + { + ir.Write(IROp::FTrunc, fd, fs); + break; + } + + case 14://FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s + { + ir.Write(IROp::FCeil, fd, fs); + break; + } + case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s + { + 
ir.Write(IROp::FFloor, fd, fs); + break; + } + + case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w + ir.Write(IROp::FCvtSW, fd, fs); + break; + + case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s + ir.Write(IROp::FCvtWS, fd, fs); + break; + + default: + DISABLE; + } +} + +void IRJit::Comp_mxc1(MIPSOpcode op) +{ + CONDITIONAL_DISABLE; + + int fs = _FS; + MIPSGPReg rt = _RT; + + switch ((op >> 21) & 0x1f) { + case 0: // R(rt) = FI(fs); break; //mfc1 + if (rt == MIPS_REG_ZERO) { + return; + } + gpr.MapDirty(rt); + ir.Write(IROp::FMovToGPR, rt, fs); + return; + + case 2: //cfc1 + if (rt == MIPS_REG_ZERO) { + return; + } + if (fs == 31) { + DISABLE; + } else if (fs == 0) { + gpr.SetImm(rt, MIPSState::FCR0_VALUE); + } else { + // Unsupported regs are always 0. + gpr.SetImm(rt, 0); + } + return; + + case 4: //FI(fs) = R(rt); break; //mtc1 + gpr.MapDirty(rt); + ir.Write(IROp::FMovFromGPR, fs, rt); + return; + + case 6: //ctc1 + if (fs == 31) { + // Set rounding mode + DISABLE; + } else { + Comp_Generic(op); + } + return; + default: + DISABLE; + break; + } +} + +} // namespace MIPSComp diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp new file mode 100644 index 000000000000..53ea1f866fe9 --- /dev/null +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -0,0 +1,162 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + + +// Optimization ideas: +// +// It's common to see sequences of stores writing or reading to a contiguous set of +// addresses in function prologues/epilogues: +// sw s5, 104(sp) +// sw s4, 100(sp) +// sw s3, 96(sp) +// sw s2, 92(sp) +// sw s1, 88(sp) +// sw s0, 84(sp) +// sw ra, 108(sp) +// mov s4, a0 +// mov s3, a1 +// ... +// Such sequences could easily be detected and turned into nice contiguous +// sequences of ARM stores instead of the current 3 instructions per sw/lw. +// +// Also, if we kept track of the likely register content of a cached register, +// (pointer or data), we could avoid many BIC instructions. + + +#include "Core/MemMap.h" +#include "Core/Config.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSAnalyst.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRRegCache.h" + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. 
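// Editor's sketch (not in the patch): the contiguous-store detection suggested in the
// optimization notes at the top of this file could start out like this - scan forward
// while we keep seeing "sw <reg>, imm($sp)" with offsets descending by one word, as in
// the prologue example above. Standard field layout op(31:26) rs(25:21) imm(15:0) assumed.
static int CountContiguousSPStores(const u32 *ops, int maxCount) {
	int count = 0;
	int expectedOffset = 0;
	for (int i = 0; i < maxCount; i++) {
		const u32 op = ops[i];
		if ((op >> 26) != 43)            // opcode 43 = sw
			break;
		if (((op >> 21) & 0x1F) != 29)   // base register must be $sp (reg 29)
			break;
		const int offset = (short)(op & 0xFFFF);
		if (count > 0 && offset != expectedOffset)
			break;
		expectedOffset = offset - 4;     // next store expected one word lower
		count++;
	}
	return count;
}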
+ +// #define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +namespace MIPSComp { + void IRJit::Comp_ITypeMemLR(MIPSOpcode op, bool load) { + DISABLE; + } + + void IRJit::Comp_ITypeMem(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int offset = (signed short)(op & 0xFFFF); + bool load = false; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + int o = op >> 26; + if (((op >> 29) & 1) == 0 && rt == MIPS_REG_ZERO) { + // Don't load anything into $zr + return; + } + + u32 iaddr = gpr.IsImm(rs) ? offset + gpr.GetImm(rs) : 0xFFFFFFFF; + int addrReg = IRTEMP_0; + switch (o) { + // Load + case 35: + ir.Write(IROp::Load32, rt, rs, ir.AddConstant(offset)); + break; + case 37: + ir.Write(IROp::Load16, rt, rs, ir.AddConstant(offset)); + break; + case 33: + ir.Write(IROp::Load16Ext, rt, rs, ir.AddConstant(offset)); + break; + case 36: + ir.Write(IROp::Load8, rt, rs, ir.AddConstant(offset)); + break; + case 32: + ir.Write(IROp::Load8Ext, rt, rs, ir.AddConstant(offset)); + break; + // Store + case 43: + ir.Write(IROp::Store32, rt, rs, ir.AddConstant(offset)); + break; + case 41: + ir.Write(IROp::Store16, rt, rs, ir.AddConstant(offset)); + break; + case 40: + ir.Write(IROp::Store8, rt, rs, ir.AddConstant(offset)); + break; + + case 34: //lwl + case 38: //lwr + load = true; + case 42: //swl + case 46: //swr + DISABLE; + break; + default: + Comp_Generic(op); + return; + } + } + + void IRJit::Comp_Cache(MIPSOpcode op) { +// int imm = (s16)(op & 0xFFFF); +// int rs = _RS; +// int addr = R(rs) + imm; + int func = (op >> 16) & 0x1F; + + // It appears that a cache line is 0x40 (64) bytes, loops in games + // issue the cache instruction at that interval. + + // These codes might be PSP-specific, they don't match regular MIPS cache codes very well + switch (func) { + // Icache + case 8: + // Invalidate the instruction cache at this address + DISABLE; + break; + // Dcache + case 24: + // "Create Dirty Exclusive" - for avoiding a cacheline fill before writing to it. + // Will cause garbage on the real machine so we just ignore it, the app will overwrite the cacheline. + break; + case 25: // Hit Invalidate - zaps the line if present in cache. Should not writeback???? scary. + // No need to do anything. + break; + case 27: // D-cube. Hit Writeback Invalidate. Tony Hawk Underground 2 + break; + case 30: // GTA LCS, a lot. Fill (prefetch). Tony Hawk Underground 2 + break; + + default: + DISABLE; + break; + } + } +} diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp new file mode 100644 index 000000000000..d7b807fe6347 --- /dev/null +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -0,0 +1,326 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
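// VFPU lowering. For now only the prefix bookkeeping (Comp_VPFX, ApplyPrefixST,
// ApplyPrefixD, GetVectorRegsPrefixD) is sketched in (the register work is still
// commented out), and every vector instruction below falls back to the interpreter
// via DISABLE.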
+ +#include +#include "math/math_util.h" + +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSTables.h" +#include "Core/MIPS/MIPSAnalyst.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Common/CPUDetect.h" +#include "Core/Config.h" +#include "Core/Reporting.h" + +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRRegCache.h" + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. + +// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +namespace MIPSComp { + + void IRJit::Comp_VPFX(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int data = op & 0xFFFFF; + int regnum = (op >> 24) & 3; + switch (regnum) { + case 0: // S + js.prefixS = data; + js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY; + break; + case 1: // T + js.prefixT = data; + js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY; + break; + case 2: // D + js.prefixD = data; + js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY; + break; + default: + ERROR_LOG(CPU, "VPFX - bad regnum %i : data=%08x", regnum, data); + break; + } + } + + void IRJit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { + if (prefix == 0xE4) + return; + + int n = GetNumVectorElements(sz); + u8 origV[4]; + static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f }; + + for (int i = 0; i < n; i++) + origV[i] = vregs[i]; + + for (int i = 0; i < n; i++) { + int regnum = (prefix >> (i * 2)) & 3; + int abs = (prefix >> (8 + i)) & 1; + int negate = (prefix >> (16 + i)) & 1; + int constants = (prefix >> (12 + i)) & 1; + + // Unchanged, hurray. + if (!constants && regnum == i && !abs && !negate) + continue; + + /* + // This puts the value into a temp reg, so we won't write the modified value back. + vregs[i] = fpr.GetTempV(); + if (!constants) { + fpr.MapDirtyInV(vregs[i], origV[regnum]); + fpr.SpillLockV(vregs[i]); + + // Prefix may say "z, z, z, z" but if this is a pair, we force to x. + // TODO: But some ops seem to use const 0 instead? + if (regnum >= n) { + WARN_LOG(CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, GetCompilerPC(), MIPSDisasmAt(GetCompilerPC())); + regnum = 0; + } + + if (abs) { + fp.FABS(fpr.V(vregs[i]), fpr.V(origV[regnum])); + if (negate) + fp.FNEG(fpr.V(vregs[i]), fpr.V(vregs[i])); + } else { + if (negate) + fp.FNEG(fpr.V(vregs[i]), fpr.V(origV[regnum])); + else + fp.FMOV(fpr.V(vregs[i]), fpr.V(origV[regnum])); + } + } else { + fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT); + fpr.SpillLockV(vregs[i]); + fp.MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs << 2)], SCRATCH1, (bool)negate); + } + */ + } + } + + void IRJit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); + + GetVectorRegs(regs, sz, vectorReg); + if (js.prefixD == 0) + return; + + int n = GetNumVectorElements(sz); + for (int i = 0; i < n; i++) { + // Hopefully this is rare, we'll just write it into a reg we drop. 
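		// Editor's note on the prefix encodings used here: the S/T prefixes decoded in
		// ApplyPrefixST() pack, for lane i, the source swizzle in bits 2i+1:2i, abs in
		// bit 8+i, "use constant table" in bit 12+i and negate in bit 16+i (0xE4 is the
		// identity swizzle). The D prefix keeps per-lane saturation in bits 2i+1:2i
		// (1 = clamp to [0,1], 3 = clamp to [-1,1], see ApplyPrefixD() below) and the
		// write mask in bits 8..11, which is what the commented-out lines here would
		// honor by redirecting masked lanes to a temp register.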
+ //if (js.VfpuWriteMask(i)) + // regs[i] = fpr.GetTempV(); + } + } + + void IRJit::ApplyPrefixD(const u8 *vregs, VectorSize sz) { + _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); + if (!js.prefixD) + return; + + /* + int n = GetNumVectorElements(sz); + for (int i = 0; i < n; i++) { + if (js.VfpuWriteMask(i)) + continue; + + int sat = (js.prefixD >> (i * 2)) & 3; + if (sat == 1) { + // clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1] + fpr.MapRegV(vregs[i], MAP_DIRTY); + + fp.MOVI2F(S0, 0.0f, SCRATCH1); + fp.MOVI2F(S1, 1.0f, SCRATCH1); + fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1); + fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); + } else if (sat == 3) { + // clamped = x < -1 ? (x > 1 ? 1 : x) : x [-1, 1] + fpr.MapRegV(vregs[i], MAP_DIRTY); + + fp.MOVI2F(S0, -1.0f, SCRATCH1); + fp.MOVI2F(S1, 1.0f, SCRATCH1); + fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1); + fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); + } + } + */ + } + + void IRJit::Comp_SV(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_SVQ(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VVectorInit(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VIdt(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VMatrixInit(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VHdp(MIPSOpcode op) { + DISABLE; + } + + static const float MEMORY_ALIGNED16(vavg_table[4]) = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f }; + + void IRJit::Comp_Vhoriz(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VDot(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VecDo3(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VV2Op(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vi2f(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vh2f(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vf2i(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Mftv(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vmfvc(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vmtvc(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vmmov(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VScl(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vmmul(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vmscl(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vtfm(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VCrs(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VDet(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vi2x(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vx2i(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_VCrossQuat(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vcmp(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vcmov(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Viim(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vfim(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vcst(MIPSOpcode op) { + DISABLE; + } + + // Very heavily used by FF:CC. Should be replaced by a fast approximation instead of + // calling the math library. 
+ void IRJit::Comp_VRot(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vsgn(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vocp(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_ColorConv(MIPSOpcode op) { + DISABLE; + } + + void IRJit::Comp_Vbfy(MIPSOpcode op) { + DISABLE; + } +} diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp new file mode 100644 index 000000000000..cfcbe4349747 --- /dev/null +++ b/Core/MIPS/IR/IRInst.cpp @@ -0,0 +1,316 @@ +#include "Core/MIPS/IR/IRInst.h" +#include "Core/MemMap.h" + +IRMeta meta[] = { + { IROp::SetConst, "SetConst", "GC" }, + { IROp::Add, "Add", "GGG" }, + { IROp::Sub, "Sub", "GGG" }, + { IROp::Neg, "Neg", "GG" }, + { IROp::Not, "Not", "GG" }, + { IROp::And, "And", "GGG" }, + { IROp::Or, "Or", "GGG" }, + { IROp::Xor, "Xor", "GGG" }, + { IROp::AddConst, "AddConst", "GGC" }, + { IROp::SubConst, "SubConst", "GGC" }, + { IROp::AndConst, "AndConst", "GGC" }, + { IROp::OrConst, "OrConst", "GGC" }, + { IROp::XorConst, "XorConst", "GGC" }, + { IROp::Shl, "Shl", "GGG" }, + { IROp::Shr, "Shr", "GGG" }, + { IROp::Sar, "Sar", "GGG" }, + { IROp::Ror, "Ror", "GGG" }, + { IROp::ShlImm, "ShlImm", "GGI" }, + { IROp::ShrImm, "ShrImm", "GGI" }, + { IROp::SarImm, "SarImm", "GGI" }, + { IROp::RorImm, "RorImm", "GGI" }, + { IROp::Slt, "Slt","GGC" }, + { IROp::SltConst, "SltConst","GGC" }, + { IROp::SltU, "SltU", "GGC" }, + { IROp::SltUConst, "SltUConst", "GGC" }, + { IROp::Clz, "Clz", "GG" }, + { IROp::MovZ, "MovZ", "GGG" }, + { IROp::MovNZ, "MovNZ", "GGG" }, + { IROp::Max, "Max", "GGG" }, + { IROp::Min, "Min", "GGG" }, + { IROp::BSwap16, "BSwap16", "GG" }, + { IROp::BSwap32, "BSwap32", "GG" }, + { IROp::Mul, "Mul", "_GG" }, + { IROp::Ext8to32, "Ext8to32", "GG" }, + { IROp::Ext16to32, "Ext16to32", "GG" }, + { IROp::FAdd, "FAdd", "FFF" }, + { IROp::FSub, "FSub", "FFF" }, + { IROp::FMul, "FMul", "FFF" }, + { IROp::FDiv, "FDiv", "FFF" }, + { IROp::FMov, "FMov", "FF" }, + { IROp::FSqrt, "FSqrt", "FF" }, + { IROp::FNeg, "FNeg", "FF" }, + { IROp::FAbs, "FAbs", "FF" }, + { IROp::FRound, "FRound", "FF" }, + { IROp::FTrunc, "FTrunc", "FF" }, + { IROp::FCeil, "FCeil", "FF" }, + { IROp::FFloor, "FFloor", "FF" }, + { IROp::FCvtWS, "FCvtWS", "FF" }, + { IROp::FCvtSW, "FCvtSW", "FF" }, + { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, + { IROp::FMovToGPR, "FMovToGPR", "GF" }, + { IROp::FpCondToReg, "FpCondToReg", "G" }, + { IROp::SetCtrlVFPU, "SetCtrlVFPU", "T" }, + { IROp::Interpret, "Interpret", "_C" }, + { IROp::Downcount, "Downcount", "_II" }, + { IROp::Syscall, "Syscall", "_C"}, + { IROp::SetPC, "SetPC", "_C"}, +}; + +const IRMeta *metaIndex[256]; + +void InitIR() { + for (size_t i = 0; i < ARRAY_SIZE(meta); i++) { + metaIndex[(int)meta[i].op] = &meta[i]; + } +} + +u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) { + const IRInst *end = inst + count; + while (inst != end) { + switch (inst->op) { + case IROp::SetConst: + mips->r[inst->dest] = constPool[inst->src1]; + break; + case IROp::Add: + mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; + break; + case IROp::Sub: + mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2]; + break; + case IROp::Neg: + mips->r[inst->dest] = -(s32)mips->r[inst->src1]; + break; + case IROp::Ext8to32: + mips->r[inst->dest] = (s32)(s8)mips->r[inst->src1]; + break; + case IROp::Ext16to32: + mips->r[inst->dest] = (s32)(s16)mips->r[inst->src1]; + break; + + case IROp::Load8: + mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + 
break; + case IROp::Load8Ext: + mips->r[inst->dest] = (s32)(s8)Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16: + mips->r[inst->dest] = Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16Ext: + mips->r[inst->dest] = (s32)(s16)Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load32: + mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::Store8: + Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store16: + Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store32: + Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::ShlImm: + mips->r[inst->dest] = mips->r[inst->src1] << inst->src2; + break; + case IROp::ShrImm: + mips->r[inst->dest] = mips->r[inst->src1] >> inst->src2; + break; + case IROp::SarImm: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> inst->src2; + break; + case IROp::RorImm: + { + u32 x = mips->r[inst->src1]; + int sa = inst->src2; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::Shl: + mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31); + break; + case IROp::Shr: + mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Sar: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Ror: + { + u32 x = mips->r[inst->src1]; + int sa = mips->r[inst->src2] & 31; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::MovZ: + if (mips->r[inst->src1] == 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + case IROp::MovNZ: + if (mips->r[inst->src1] != 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + + case IROp::Max: + mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2]; + break; + case IROp::Min: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? 
mips->r[inst->src1] : mips->r[inst->src2]; + break; + + case IROp::BSwap16: + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); + break; + } + case IROp::BSwap32: + mips->r[inst->dest] = swap32(mips->r[inst->src1]); + break; + + case IROp::FAdd: + mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2]; + break; + case IROp::FSub: + mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2]; + break; + case IROp::FMul: + mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2]; + break; + case IROp::FDiv: + mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2]; + break; + + case IROp::FMov: + mips->f[inst->dest] = mips->f[inst->src1]; + break; + case IROp::FAbs: + mips->f[inst->dest] = fabsf(mips->f[inst->src1]); + break; + case IROp::FSqrt: + mips->f[inst->dest] = sqrtf(mips->f[inst->src1]); + break; + case IROp::FNeg: + mips->f[inst->dest] = -mips->f[inst->src1]; + break; + case IROp::FpCondToReg: + mips->r[inst->dest] = mips->fpcond; + break; + + case IROp::ExitToConst: + return constPool[inst->src1]; + + case IROp::ExitToReg: + return mips->r[inst->src1]; + + case IROp::ExitToConstIfEq: + if (mips->r[inst->src1] == mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfNeq: + if (mips->r[inst->src1] != mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGtZ: + if ((s32)mips->r[inst->src1] > 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGeZ: + if ((s32)mips->r[inst->src1] >= 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLtZ: + if ((s32)mips->r[inst->src1] < 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLeZ: + if ((s32)mips->r[inst->src1] <= 0) + return constPool[inst->dest]; + break; + + case IROp::SetPC: + return mips->pc = mips->r[inst->src1]; + + default: + Crash(); + } + inst++; + } + + // If we got here, the block was badly constructed. + // Crash(); + return 0; +} + +void IRWriter::Write(IROp op, u8 dst, u8 src1, u8 src2) { + IRInst inst; + inst.op = op; + inst.dest = dst; + inst.src1 = src1; + inst.src2 = src2; + insts_.push_back(inst); +} + +void IRWriter::WriteSetConstant(u8 dst, u32 value) { + // TODO: Check for the fixed ones first. 
+ Write(IROp::SetConstImm, AddConstant(value)); +} + +int IRWriter::AddConstant(u32 value) { + for (size_t i = 0; i < constPool_.size(); i++) { + if (constPool_[i] == value) + return i; + } + constPool_.push_back(value); + return (int)constPool_.size() - 1; +} + +int IRWriter::AddConstantFloat(float value) { + u32 val; + memcpy(&val, &value, 4); + return AddConstant(val); +} + +void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *constPool) { + switch (type) { + case 'G': + snprintf(buf, bufSize, "r%d", param); + break; + case 'F': + snprintf(buf, bufSize, "r%d", param); + break; + case 'C': + snprintf(buf, bufSize, "%08x", constPool[param]); + break; + default: + snprintf(buf, bufSize, "?"); + break; + } +} + +void DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool) { + const IRMeta *meta = metaIndex[(int)inst.op]; + char bufDst[16]; + char bufSrc1[16]; + char bufSrc2[16]; + DisassembleParam(bufDst, sizeof(bufDst) - 2, inst.dest, meta->types[0], constPool); + DisassembleParam(bufSrc1, sizeof(bufSrc1) - 2, inst.dest, meta->types[1], constPool); + DisassembleParam(bufSrc2, sizeof(bufSrc2), inst.dest, meta->types[2], constPool); + if (meta->types[1]) { + strcat(bufDst, ", "); + } + if (meta->types[2]) { + strcat(bufSrc1, ", "); + } + snprintf(buf, bufsize, "%s %s%s%s", meta->name, bufDst, bufSrc1, bufSrc2); +} \ No newline at end of file diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h new file mode 100644 index 000000000000..a1aa75edc7cf --- /dev/null +++ b/Core/MIPS/IR/IRInst.h @@ -0,0 +1,260 @@ +#pragma once + +#include + +#include "Common/CommonTypes.h" +#include "Core/MIPS/MIPS.h" + +// Basic IR +// +// This IR refers implicitly to the MIPS register set and is simple to interpret. +// To do real compiler things with it and do full-function compilation, it probably +// needs to be lifted to a higher IR first, before being lowered onto each target. +// But this gets rid of a lot of MIPS idiosyncrasies that makes it tricky, like +// delay slots, and is very suitable for translation into other IRs. Can of course +// even be directly JIT-ed, but the gains will probably be tiny over our older direct +// MIPS->target JITs. + +enum class IROp : u8 { + SetConst, + SetConstImm, + FSetConst, + + Mov, + + Add, + Sub, + Neg, + Not, + + And, + Or, + Xor, + + AddConst, + SubConst, + + AndConst, + OrConst, + XorConst, + + Shl, + Shr, + Sar, + Ror, + + // The shift is stored directly, not in the const table, so Imm instead of Const + ShlImm, + ShrImm, + SarImm, + RorImm, + + Slt, + SltConst, + SltU, + SltUConst, + + Clz, + + // Conditional moves + MovZ, + MovNZ, + + Max, + Min, + + // Byte swaps. All CPUs have native ones so worth keeping. + BSwap16, // Swaps both the high and low byte pairs. + BSwap32, + + // Hi/Lo semantics preserved. + Mul, + MulU, + Madd, + MaddU, + Msub, + MsubU, + + // These take a constant from the pool as an offset. + // Loads from a constant address can be represented by using r0. 
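	// (For example, "lw rt, imm(rs)" in Comp_ITypeMem() becomes Load32 with dest = rt,
	// src1 = rs and src2 = the constant-pool index holding imm.)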
+ Load8, + Load8Ext, + Load16, + Load16Ext, + Load32, + LoadFloat, + + Store8, + Store16, + Store32, + StoreFloat, + + Ext8to32, + Ext16to32, + + FAdd, + FSub, + FMul, + FDiv, + + FMov, + FSqrt, + FNeg, + FAbs, + + FRound, + FTrunc, + FCeil, + FFloor, + + FCvtWS, + FCvtSW, + + FMovFromGPR, + FMovToGPR, + + FpCondToReg, + VfpCondToReg, + + FCmpUnordered, + FCmpEqual, + FCmpEqualUnordered, + FCmpLessOrdered, + FCmpLessUnordered, + FCmpLessEqualOrdered, + FCmpLessEqualUnordered, + + // Rounding Mode + RestoreRoundingMode, + ApplyRoundingMode, + UpdateRoundingMode, + + SetCtrlVFPU, + + // Fake/System instructions + Interpret, + + // Emit this before you exits. Semantic is to set the downcount + // that will be used at the actual exit. + Downcount, // src1 + (src2<<8) + + // End-of-basic-block. + ExitToConst, // 0, const, downcount + ExitToReg, + ExitToConstIfEq, // const, reg1, reg2 + ExitToConstIfNeq, // const, reg1, reg2 + ExitToConstIfGtZ, // const, reg1, 0 + ExitToConstIfGeZ, // const, reg1, 0 + ExitToConstIfLtZ, // const, reg1, 0 + ExitToConstIfLeZ, // const, reg1, 0 + + ExitToConstIfFpTrue, + ExitToConstIfFpFalse, + + Syscall, + SetPC, // hack to make syscall returns work + Break, +}; + +enum IRComparison { + Greater, + GreaterEqual, + Less, + LessEqual, + Equal, + NotEqual, + Bad, +}; + +// Hm, unused +inline IRComparison Invert(IRComparison comp) { + switch (comp) { + case IRComparison::Equal: return IRComparison::NotEqual; + case IRComparison::NotEqual: return IRComparison::Equal; + case IRComparison::Greater: return IRComparison::LessEqual; + case IRComparison::GreaterEqual: return IRComparison::Less; + case IRComparison::Less: return IRComparison::GreaterEqual; + case IRComparison::LessEqual: return IRComparison::Greater; + default: + return IRComparison::Bad; + } +} + +inline IROp ComparisonToExit(IRComparison comp) { + switch (comp) { + case IRComparison::Equal: return IROp::ExitToConstIfEq; + case IRComparison::NotEqual: return IROp::ExitToConstIfNeq; + case IRComparison::Greater: return IROp::ExitToConstIfGtZ; + case IRComparison::GreaterEqual: return IROp::ExitToConstIfGeZ; + case IRComparison::Less: return IROp::ExitToConstIfLtZ; + case IRComparison::LessEqual: return IROp::ExitToConstIfLeZ; + default: + return IROp::Break; + } +} + +enum { + IRTEMP_0 = 192, + IRTEMP_1, + IRTEMP_2, + IRTEMP_3, + + // Hacky way to get to other state + IRREG_LO = 226, // offset of lo in MIPSState / 4 + IRREG_HI = 227, +}; + +enum class IRParam { + Ignore = '_', + UImm8 = 'U', + Const = 'C', + GPR = 'G', + FPR = 'F', + VPR = 'V', + VCtrl = 'T', +}; + +struct IRMeta { + IROp op; + const char *name; + const char types[4]; // GGG + u32 flags; +}; + +// 32 bits. +struct IRInst { + IROp op; + union { + u8 dest; + u8 src3; + }; + u8 src1; + u8 src2; +}; + +// Returns the new PC. +u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count); + +// Each IR block gets a constant pool. 
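// Illustrative sketch (assumed usage, not verbatim from this patch): judging by
// DoJit() and the Comp_* routines later in this series, a block is emitted into
// an IRWriter and then handed off as flat arrays, roughly like this, where
// rt/rs/imm/targetAddr/block stand in for real values:
//
//   IRWriter ir;
//   ir.Clear();
//   ir.Write(IROp::AddConst, rt, rs, ir.AddConstant(imm));    // rt = rs + imm
//   ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr));  // end of block
//   block->SetInstructions(ir.GetInstructions(), ir.GetConstants());
//
// At run time, IRInterpret() walks the stored instructions against the constant
// pool and returns the next PC.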
+class IRWriter { +public: + void Write(IROp op, u8 dst = 0, u8 src1 = 0, u8 src2 = 0); + void WriteSetConstant(u8 dst, u32 value); + + int AddConstant(u32 value); + int AddConstantFloat(float value); + + void Clear() { + insts_.clear(); + constPool_.clear(); + } + + const std::vector &GetInstructions() { return insts_; } + const std::vector &GetConstants() { return constPool_; } + +private: + std::vector insts_; + std::vector constPool_; +}; + +void DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp new file mode 100644 index 000000000000..b16706b2d68e --- /dev/null +++ b/Core/MIPS/IR/IRJit.cpp @@ -0,0 +1,333 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "base/logging.h" +#include "profiler/profiler.h" +#include "Common/ChunkFile.h" +#include "Common/CPUDetect.h" +#include "Common/StringUtils.h" + +#include "Core/Reporting.h" +#include "Core/Config.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/Debugger/SymbolMap.h" +#include "Core/MemMap.h" + +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" +#include "Core/HLE/ReplaceTables.h" +#include "Core/HLE/sceKernelMemory.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/JitCommon/JitCommon.h" + +void DisassembleArm64Print(const u8 *data, int size) { + std::vector lines = DisassembleArm64(data, size); + for (auto s : lines) { + ILOG("%s", s.c_str()); + } + /* + ILOG("+++"); + // A format friendly to Online Disassembler which gets endianness wrong + for (size_t i = 0; i < lines.size(); i++) { + uint32_t opcode = ((const uint32_t *)data)[i]; + ILOG("%d/%d: %08x", (int)(i+1), (int)lines.size(), swap32(opcode)); + } + ILOG("==="); + ILOG("===");*/ +} + +namespace MIPSComp +{ + +IRJit::IRJit(MIPSState *mips) : gpr(), mips_(mips) { + logBlocks = 0; + dontLogBlocks = 0; + js.startDefaultPrefix = mips_->HasDefaultPrefix(); + js.currentRoundingFunc = convertS0ToSCRATCH1[0]; + u32 size = 128 * 1024; + blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); +} + +IRJit::~IRJit() { +} + +void IRJit::DoState(PointerWrap &p) { + auto s = p.Section("Jit", 1, 2); + if (!s) + return; + + p.Do(js.startDefaultPrefix); + if (s >= 2) { + p.Do(js.hasSetRounding); + js.lastSetRounding = 0; + } else { + js.hasSetRounding = 1; + } + + if (p.GetMode() == PointerWrap::MODE_READ) { + js.currentRoundingFunc = convertS0ToSCRATCH1[(mips_->fcr31) & 3]; + } +} + +// This is here so the savestate matches between jit and non-jit. 
+void IRJit::DoDummyState(PointerWrap &p) { + auto s = p.Section("Jit", 1, 2); + if (!s) + return; + + bool dummy = false; + p.Do(dummy); + if (s >= 2) { + dummy = true; + p.Do(dummy); + } +} + +void IRJit::FlushAll() { + FlushPrefixV(); +} + +void IRJit::FlushPrefixV() { + if ((js.prefixSFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_SPREFIX, ir.AddConstant(js.prefixS)); + js.prefixSFlag = (JitState::PrefixState) (js.prefixSFlag & ~JitState::PREFIX_DIRTY); + } + + if ((js.prefixTFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_TPREFIX, ir.AddConstant(js.prefixT)); + js.prefixTFlag = (JitState::PrefixState) (js.prefixTFlag & ~JitState::PREFIX_DIRTY); + } + + if ((js.prefixDFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_DPREFIX, ir.AddConstant(js.prefixD)); + js.prefixDFlag = (JitState::PrefixState) (js.prefixDFlag & ~JitState::PREFIX_DIRTY); + } +} + +void IRJit::ClearCache() { + ILOG("ARM64Jit: Clearing the cache!"); + blocks_.Clear(); +} + +void IRJit::InvalidateCache() { + blocks_.Clear(); +} + +void IRJit::InvalidateCacheAt(u32 em_address, int length) { + blocks_.InvalidateICache(em_address, length); +} + +void IRJit::EatInstruction(MIPSOpcode op) { + MIPSInfo info = MIPSGetInfo(op); + if (info & DELAYSLOT) { + ERROR_LOG_REPORT_ONCE(ateDelaySlot, JIT, "Ate a branch op."); + } + if (js.inDelaySlot) { + ERROR_LOG_REPORT_ONCE(ateInDelaySlot, JIT, "Ate an instruction inside a delay slot."); + } + + js.numInstructions++; + js.compilerPC += 4; + js.downcountAmount += MIPSGetInstructionCycleEstimate(op); +} + +void IRJit::CompileDelaySlot() { + js.inDelaySlot = true; + MIPSOpcode op = GetOffsetInstruction(1); + MIPSCompileOp(op, this); + js.inDelaySlot = false; +} + +void IRJit::Compile(u32 em_address) { + PROFILE_THIS_SCOPE("jitc"); + + int block_num = blocks_.AllocateBlock(em_address); + IRBlock *b = blocks_.GetBlock(block_num); + DoJit(em_address, b); + + bool cleanSlate = false; + + if (js.hasSetRounding && !js.lastSetRounding) { + WARN_LOG(JIT, "Detected rounding mode usage, rebuilding jit with checks"); + // Won't loop, since hasSetRounding is only ever set to 1. + js.lastSetRounding = js.hasSetRounding; + cleanSlate = true; + } + + // Drat. The VFPU hit an uneaten prefix at the end of a block. + if (js.startDefaultPrefix && js.MayHavePrefix()) { + WARN_LOG(JIT, "An uneaten prefix at end of block: %08x", GetCompilerPC() - 4); + js.LogPrefix(); + + // Let's try that one more time. We won't get back here because we toggled the value. + js.startDefaultPrefix = false; + // TODO ARM64: This crashes. + //cleanSlate = true; + } + + if (cleanSlate) { + // Our assumptions are all wrong so it's clean-slate time. 
+ ClearCache(); + Compile(em_address); + } +} + +void IRJit::RunLoopUntil(u64 globalticks) { + PROFILE_THIS_SCOPE("jit"); + ((void (*)())enterDispatcher)(); +} + +u32 IRJit::GetCompilerPC() { + return js.compilerPC; +} + +MIPSOpcode IRJit::GetOffsetInstruction(int offset) { + return Memory::Read_Instruction(GetCompilerPC() + 4 * offset); +} + +void IRJit::DoJit(u32 em_address, IRBlock *b) { + js.cancel = false; + js.blockStart = mips_->pc; + js.compilerPC = mips_->pc; + js.lastContinuedPC = 0; + js.initialBlockSize = 0; + js.nextExit = 0; + js.downcountAmount = 0; + js.curBlock = nullptr; + js.compiling = true; + js.inDelaySlot = false; + js.PrefixStart(); + ir.Clear(); + + gpr.Start(&ir); + + int partialFlushOffset = 0; + + js.numInstructions = 0; + while (js.compiling) { + MIPSOpcode inst = Memory::Read_Opcode_JIT(GetCompilerPC()); + js.downcountAmount += MIPSGetInstructionCycleEstimate(inst); + MIPSCompileOp(inst, this); + js.compilerPC += 4; + js.numInstructions++; + } + + b->SetInstructions(ir.GetInstructions(), ir.GetConstants()); + + char temp[256]; + if (logBlocks > 0 && dontLogBlocks == 0) { + ILOG("=============== mips %d ===============", blocks_.GetNumBlocks()); + for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { + MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp, true); + ILOG("M: %08x %s", cpc, temp); + } + } + + if (logBlocks > 0 && dontLogBlocks == 0) { + ILOG("=============== IR (%d instructions) ===============", js.numInstructions); + for (int i = 0; i < js.numInstructions; i++) { + char buf[256]; + DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); + ILOG("%s", buf); + } + } + + if (logBlocks > 0) + logBlocks--; + if (dontLogBlocks > 0) + dontLogBlocks--; +} + +bool IRJit::DescribeCodePtr(const u8 *ptr, std::string &name) { + // Used in disassembly viewer. + return false; +} + +void IRJit::Comp_RunBlock(MIPSOpcode op) { + // This shouldn't be necessary, the dispatcher should catch us before we get here. + ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); +} + +void IRJit::LinkBlock(u8 *exitPoint, const u8 *checkedEntry) { + Crash(); +} + +void IRJit::UnlinkBlock(u8 *checkedEntry, u32 originalAddress) { + Crash(); +} + +bool IRJit::ReplaceJalTo(u32 dest) { + Crash(); + return false; +} + +void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { + Crash(); +} + +void IRJit::Comp_Generic(MIPSOpcode op) { + ir.Write(IROp::Interpret, ir.AddConstant(op.encoding)); + const MIPSInfo info = MIPSGetInfo(op); + if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) { + // If it does eat them, it'll happen in MIPSCompileOp(). + if ((info & OUT_EAT_PREFIX) == 0) + js.PrefixUnknown(); + } +} + +// Destroys SCRATCH2 +void IRJit::RestoreRoundingMode(bool force) { + // If the game has never set an interesting rounding mode, we can safely skip this. + if (force || js.hasSetRounding) { + ir.Write(IROp::RestoreRoundingMode); + } +} + +// Destroys SCRATCH1 and SCRATCH2 +void IRJit::ApplyRoundingMode(bool force) { + // If the game has never set an interesting rounding mode, we can safely skip this. 
+ if (force || js.hasSetRounding) { + ir.Write(IROp::ApplyRoundingMode); + } +} + +// Destroys SCRATCH1 and SCRATCH2 +void IRJit::UpdateRoundingMode() { + ir.Write(IROp::UpdateRoundingMode); +} + +void IRJit::Comp_DoNothing(MIPSOpcode op) { +} + +int IRJit::Replace_fabsf() { + Crash(); + return 0; +} + +void IRBlockCache::Clear() { + blocks_.clear(); +} + +void IRBlockCache::InvalidateICache(u32 addess, u32 length) { + // TODO +} + +} // namespace MIPSComp \ No newline at end of file diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h new file mode 100644 index 000000000000..686eefe6c274 --- /dev/null +++ b/Core/MIPS/IR/IRJit.h @@ -0,0 +1,276 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#pragma once + +#include "Common/CPUDetect.h" +#include "Core/MIPS/JitCommon/JitState.h" +#include "Core/MIPS/JitCommon/JitBlockCache.h" +#include "Core/MIPS/JitCommon/JitCommon.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Core/MIPS/IR/IRInst.h" +#include "Core/MIPS/MIPSVFPUUtils.h" + +#ifndef offsetof +#include "stddef.h" +#endif + +namespace MIPSComp { + +// TODO : Use arena allocators. For now let's just malloc. 
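// Block life cycle, as wired up by IRJit::Compile()/DoJit() elsewhere in this
// patch (a descriptive sketch; block_num/em_address are the patch's own names):
//
//   int block_num = blocks_.AllocateBlock(em_address);
//   IRBlock *b = blocks_.GetBlock(block_num);
//   DoJit(em_address, b);   // emits IR, then calls b->SetInstructions(...)
//
// The flat instruction/constant arrays stored here are exactly what
// IRInterpret() consumes when the block is executed.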
+class IRBlock { +public: + IRBlock() {} + IRBlock(u32 emAddr) : instr_(nullptr), const_(nullptr), origAddr_(emAddr), numInstructions_(0) {} + ~IRBlock() { + delete[] instr_; + delete[] const_; + } + + void SetInstructions(const std::vector &inst, const std::vector &constants) { + instr_ = new IRInst[inst.size()]; + numInstructions_ = (u16)inst.size(); + memcpy(instr_, inst.data(), sizeof(IRInst) * inst.size()); + const_ = new u32[constants.size()]; + numConstants_ = (u16)constants.size(); + memcpy(const_, constants.data(), sizeof(u32) * constants.size()); + } + +private: + IRInst *instr_; + u32 *const_; + u16 numInstructions_; + u16 numConstants_; + u32 origAddr_; +}; + +class IRBlockCache { +public: + void Clear(); + void InvalidateICache(u32 addess, u32 length); + int GetNumBlocks() const { return (int)blocks_.size(); } + int AllocateBlock(int emAddr) { + blocks_.emplace_back(IRBlock(emAddr)); + return (int)blocks_.size() - 1; + } + IRBlock *GetBlock(int i) { + return &blocks_[i]; + } +private: + std::vector blocks_; +}; + +class IRJit : public JitInterface { +public: + IRJit(MIPSState *mips); + virtual ~IRJit(); + + void DoState(PointerWrap &p) override; + void DoDummyState(PointerWrap &p) override; + + const JitOptions &GetJitOptions() { return jo; } + + // Compiled ops should ignore delay slots + // the compiler will take care of them by itself + // OR NOT + void Comp_Generic(MIPSOpcode op) override; + + void RunLoopUntil(u64 globalticks) override; + + void Compile(u32 em_address) override; // Compiles a block at current MIPS PC + void DoJit(u32 em_address, IRBlock *b); + + bool DescribeCodePtr(const u8 *ptr, std::string &name) override; + + void Comp_RunBlock(MIPSOpcode op) override; + void Comp_ReplacementFunc(MIPSOpcode op) override; + + // Ops + void Comp_ITypeMem(MIPSOpcode op) override; + void Comp_Cache(MIPSOpcode op) override; + + void Comp_RelBranch(MIPSOpcode op) override; + void Comp_RelBranchRI(MIPSOpcode op) override; + void Comp_FPUBranch(MIPSOpcode op) override; + void Comp_FPULS(MIPSOpcode op) override; + void Comp_FPUComp(MIPSOpcode op) override; + void Comp_Jump(MIPSOpcode op) override; + void Comp_JumpReg(MIPSOpcode op) override; + void Comp_Syscall(MIPSOpcode op) override; + void Comp_Break(MIPSOpcode op) override; + + void Comp_IType(MIPSOpcode op) override; + void Comp_RType2(MIPSOpcode op) override; + void Comp_RType3(MIPSOpcode op) override; + void Comp_ShiftType(MIPSOpcode op) override; + void Comp_Allegrex(MIPSOpcode op) override; + void Comp_Allegrex2(MIPSOpcode op) override; + void Comp_VBranch(MIPSOpcode op) override; + void Comp_MulDivType(MIPSOpcode op) override; + void Comp_Special3(MIPSOpcode op) override; + + void Comp_FPU3op(MIPSOpcode op) override; + void Comp_FPU2op(MIPSOpcode op) override; + void Comp_mxc1(MIPSOpcode op) override; + + void Comp_DoNothing(MIPSOpcode op) override; + + void Comp_SV(MIPSOpcode op) override; + void Comp_SVQ(MIPSOpcode op) override; + void Comp_VPFX(MIPSOpcode op) override; + void Comp_VVectorInit(MIPSOpcode op) override; + void Comp_VMatrixInit(MIPSOpcode op) override; + void Comp_VDot(MIPSOpcode op) override; + void Comp_VecDo3(MIPSOpcode op) override; + void Comp_VV2Op(MIPSOpcode op) override; + void Comp_Mftv(MIPSOpcode op) override; + void Comp_Vmfvc(MIPSOpcode op) override; + void Comp_Vmtvc(MIPSOpcode op) override; + void Comp_Vmmov(MIPSOpcode op) override; + void Comp_VScl(MIPSOpcode op) override; + void Comp_Vmmul(MIPSOpcode op) override; + void Comp_Vmscl(MIPSOpcode op) override; + void Comp_Vtfm(MIPSOpcode 
op) override; + void Comp_VHdp(MIPSOpcode op) override; + void Comp_VCrs(MIPSOpcode op) override; + void Comp_VDet(MIPSOpcode op) override; + void Comp_Vi2x(MIPSOpcode op) override; + void Comp_Vx2i(MIPSOpcode op) override; + void Comp_Vf2i(MIPSOpcode op) override; + void Comp_Vi2f(MIPSOpcode op) override; + void Comp_Vh2f(MIPSOpcode op) override; + void Comp_Vcst(MIPSOpcode op) override; + void Comp_Vhoriz(MIPSOpcode op) override; + void Comp_VRot(MIPSOpcode op) override; + void Comp_VIdt(MIPSOpcode op) override; + void Comp_Vcmp(MIPSOpcode op) override; + void Comp_Vcmov(MIPSOpcode op) override; + void Comp_Viim(MIPSOpcode op) override; + void Comp_Vfim(MIPSOpcode op) override; + void Comp_VCrossQuat(MIPSOpcode op) override; + void Comp_Vsgn(MIPSOpcode op) override; + void Comp_Vocp(MIPSOpcode op) override; + void Comp_ColorConv(MIPSOpcode op) override; + void Comp_Vbfy(MIPSOpcode op) override; + + int Replace_fabsf(); + + // Not using a regular block cache. + JitBlockCache *GetBlockCache() { return nullptr; } + + void ClearCache(); + void InvalidateCache(); + void InvalidateCacheAt(u32 em_address, int length = 4); + + void EatPrefix() { js.EatPrefix(); } + + const u8 *GetDispatcher() const override { + return dispatcher; + } + + void LinkBlock(u8 *exitPoint, const u8 *checkedEntry) override; + void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) override; + +private: + void FlushAll(); + void FlushPrefixV(); + + u32 GetCompilerPC(); + void CompileDelaySlot(); + void EatInstruction(MIPSOpcode op); + MIPSOpcode GetOffsetInstruction(int offset); + + void RestoreRoundingMode(bool force = false); + void ApplyRoundingMode(bool force = false); + void UpdateRoundingMode(); + + bool ReplaceJalTo(u32 dest); + + // Utility compilation functions + void BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely); + void BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely); + void BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely); + void BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely); + + // Utilities to reduce duplicated code + void CompImmLogic(MIPSGPReg rs, MIPSGPReg rt, u32 uimm, IROp op); + void CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp constOp, bool symmetric = false); + void CompShiftImm(MIPSOpcode op, IROp shiftType, int sa); + void CompShiftVar(MIPSOpcode op, IROp shiftType, IROp shiftTypeConst); + + void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz); + void ApplyPrefixD(const u8 *vregs, VectorSize sz); + void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixS, sz); + } + void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixT, sz); + } + void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); + + // Utils + void Comp_ITypeMemLR(MIPSOpcode op, bool load); + + JitOptions jo; + JitState js; + + IRBlockCache blocks_; + + IRRegCache gpr; + // Arm64RegCacheFPU fpr; + + MIPSState *mips_; + + int dontLogBlocks; + int logBlocks; + + IRWriter ir; + + // where to write branch-likely trampolines + u32 blTrampolines_; + int blTrampolineCount_; + +public: + // Code pointers + const u8 *enterDispatcher; + + const u8 *outerLoop; + const u8 *outerLoopPCInSCRATCH1; + const u8 *dispatcherCheckCoreState; + const u8 *dispatcherPCInSCRATCH1; + const u8 
*dispatcher; + const u8 *dispatcherNoCheck; + + const u8 *breakpointBailout; + + const u8 *saveStaticRegisters; + const u8 *loadStaticRegisters; + + const u8 *restoreRoundingMode; + const u8 *applyRoundingMode; + const u8 *updateRoundingMode; + + // Indexed by FPCR FZ:RN bits for convenience. Uses SCRATCH2. + const u8 *convertS0ToSCRATCH1[8]; +}; + +} // namespace MIPSComp + diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp new file mode 100644 index 000000000000..7a31a463e4e5 --- /dev/null +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -0,0 +1,46 @@ +#include "Core/MIPS/IR/IRRegCache.h" +#include "Core/MIPS/IR/IRInst.h" + +void IRRegCache::Dirty(MIPSGPReg rd) { + if (rd == 0) { + return; + } + if (reg_[rd].isImm) { + ir_->WriteSetConstant(rd, reg_[rd].immVal); + reg_[rd].isImm = false; + } +} + +void IRRegCache::MapIn(MIPSGPReg rd) { + Dirty(rd); +} + +void IRRegCache::MapInIn(MIPSGPReg rs, MIPSGPReg rt) { + Dirty(rs); + Dirty(rt); +} + +void IRRegCache::MapDirty(MIPSGPReg rd) { + Dirty(rd); +} + +void IRRegCache::MapDirtyIn(MIPSGPReg rd, MIPSGPReg rs) { + Dirty(rd); + Dirty(rs); +} + +void IRRegCache::MapDirtyInIn(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt) { + Dirty(rd); + Dirty(rs); + Dirty(rt); +} + +void IRRegCache::Start(IRWriter *ir) { + memset(®_, 0, sizeof(reg_)); + reg_[0].isImm = true; + ir_ = ir; +} + +void IRRegCache::FlushAll() { + +} diff --git a/Core/MIPS/IR/IRRegCache.h b/Core/MIPS/IR/IRRegCache.h new file mode 100644 index 000000000000..bf53e2a818f0 --- /dev/null +++ b/Core/MIPS/IR/IRRegCache.h @@ -0,0 +1,43 @@ +#pragma once + +// IRRegCache is only to perform pre-constant folding. This is worth it to get cleaner +// IR. + +#include "Common/CommonTypes.h" +#include "Core/MIPS/MIPS.h" + +enum { + TOTAL_MAPPABLE_MIPSREGS = 256, +}; + +struct RegIR { + bool isImm; + u32 immVal; +}; + +class IRWriter; + +class IRRegCache { +public: + void SetImm(MIPSGPReg r, u32 immVal) { + reg_[r].isImm = true; + reg_[r].immVal = immVal; + } + + bool IsImm(MIPSGPReg r) const { return reg_[r].isImm; } + u32 GetImm(MIPSGPReg r) const { return reg_[r].immVal; } + + void MapIn(MIPSGPReg rd); + void MapInIn(MIPSGPReg rs, MIPSGPReg rt); + void MapDirty(MIPSGPReg rd); + void MapDirtyIn(MIPSGPReg rd, MIPSGPReg rs); + void MapDirtyInIn(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt); + + void Start(IRWriter *ir); + void FlushAll(); + +private: + void Dirty(MIPSGPReg rd); + RegIR reg_[TOTAL_MAPPABLE_MIPSREGS]; + IRWriter *ir_; +}; diff --git a/Core/MIPS/JitCommon/JitCommon.cpp b/Core/MIPS/JitCommon/JitCommon.cpp index 91ed4c10d625..0b7fc1932244 100644 --- a/Core/MIPS/JitCommon/JitCommon.cpp +++ b/Core/MIPS/JitCommon/JitCommon.cpp @@ -22,9 +22,11 @@ #include "Common/StringUtils.h" #include "Core/Util/DisArm64.h" +#include "Core/Config.h" #include "Core/MIPS/JitCommon/JitCommon.h" #include "Core/MIPS/JitCommon/JitState.h" +#include "Core/MIPS/IR/IRJit.h" #if defined(ARM) #include "../ARM/ArmJit.h" @@ -45,17 +47,21 @@ namespace MIPSComp { } JitInterface *CreateNativeJit(MIPSState *mips) { + if (false && g_Config.iCpuCore == (int)CPUCore::CPU_JIT) { #if defined(ARM) - return new MIPSComp::ArmJit(mips); + return new MIPSComp::ArmJit(mips); #elif defined(ARM64) - return new MIPSComp::Arm64Jit(mips); + return new MIPSComp::IRJit(mips); #elif defined(_M_IX86) || defined(_M_X64) - return new MIPSComp::Jit(mips); + return new MIPSComp::Jit(mips); #elif defined(MIPS) - return new MIPSComp::MipsJit(mips); + return new MIPSComp::MipsJit(mips); #else - return new MIPSComp::FakeJit(mips); + return new 
MIPSComp::FakeJit(mips); #endif + } else if (true || g_Config.iCpuCore == (int)CPUCore::CPU_IRJIT) { + return new MIPSComp::IRJit(mips); + } } } diff --git a/Core/MIPS/MIPS.h b/Core/MIPS/MIPS.h index a24c2f3d8b81..bbc9952c4dc1 100644 --- a/Core/MIPS/MIPS.h +++ b/Core/MIPS/MIPS.h @@ -166,6 +166,10 @@ class MIPSState float v[128]; u32 vi[128]; }; + // Used for temporary variables by IR Interpreter. + // Can be indexed through r[] using indices 192+. + u32 t[16]; + // Temps don't get flushed so we don't reserve space for them. // If vfpuCtrl (prefixes) get mysterious values, check the VFPU regcache code. u32 vfpuCtrl[16]; @@ -177,7 +181,7 @@ class MIPSState struct { u32 pc; - u32 lo; + u32 lo; // offset 192 + 16 + 16 + 1 + 1 u32 hi; u32 fcr31; //fpu control register From 4acf85aa06ffbfdc54a936993aa4e310b0a75367 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 17:37:19 +0200 Subject: [PATCH 02/77] It's kind of starting to run --- Core/Core.vcxproj | 2 + Core/Core.vcxproj.filters | 6 ++ Core/MIPS/IR/IRCompALU.cpp | 39 +++++---- Core/MIPS/IR/IRCompBranch.cpp | 42 ++++++--- Core/MIPS/IR/IRCompFPU.cpp | 2 +- Core/MIPS/IR/IRCompLoadStore.cpp | 3 +- Core/MIPS/IR/IRInst.cpp | 146 +++++++++++++++++++++++++++---- Core/MIPS/IR/IRInst.h | 6 +- Core/MIPS/IR/IRJit.cpp | 77 +++++++++++----- Core/MIPS/IR/IRJit.h | 23 ++++- Core/MIPS/IR/IRPassSimplify.cpp | 14 +++ Core/MIPS/IR/IRPassSimplify.h | 5 ++ Core/MIPS/IR/IRRegCache.cpp | 4 +- Core/MIPS/JitCommon/JitCommon.h | 1 + Core/MIPS/x86/Jit.cpp | 10 +++ Core/MIPS/x86/Jit.h | 1 + Core/MemMap.cpp | 8 +- 17 files changed, 309 insertions(+), 80 deletions(-) create mode 100644 Core/MIPS/IR/IRPassSimplify.cpp create mode 100644 Core/MIPS/IR/IRPassSimplify.h diff --git a/Core/Core.vcxproj b/Core/Core.vcxproj index af4b459fb1b9..e902adf7332d 100644 --- a/Core/Core.vcxproj +++ b/Core/Core.vcxproj @@ -189,6 +189,7 @@ + @@ -518,6 +519,7 @@ + diff --git a/Core/Core.vcxproj.filters b/Core/Core.vcxproj.filters index 99af2a2696fe..5905d62de115 100644 --- a/Core/Core.vcxproj.filters +++ b/Core/Core.vcxproj.filters @@ -664,6 +664,9 @@ MIPS\IR + + MIPS\IR + @@ -1218,6 +1221,9 @@ MIPS\IR + + MIPS\IR + diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 67059e371e5e..69cf25de5604 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -49,15 +49,15 @@ namespace MIPSComp { void IRJit::CompImmLogic(MIPSGPReg rs, MIPSGPReg rt, u32 uimm, IROp OP) { if (gpr.IsImm(rs)) { switch (OP) { - case IROp::AddConst: gpr.SetImm(rt, rs + uimm); break; - case IROp::SubConst: gpr.SetImm(rt, rs - uimm); break; - case IROp::AndConst: gpr.SetImm(rt, rs & uimm); break; - case IROp::OrConst: gpr.SetImm(rt, rs | uimm); break; - case IROp::XorConst: gpr.SetImm(rt, rs ^ uimm); break; + case IROp::AddConst: gpr.SetImm(rt, gpr.GetImm(rs) + uimm); break; + case IROp::SubConst: gpr.SetImm(rt, gpr.GetImm(rs) - uimm); break; + case IROp::AndConst: gpr.SetImm(rt, gpr.GetImm(rs) & uimm); break; + case IROp::OrConst: gpr.SetImm(rt, gpr.GetImm(rs) | uimm); break; + case IROp::XorConst: gpr.SetImm(rt, gpr.GetImm(rs) ^ uimm); break; } } else { gpr.MapDirtyIn(rt, rs); - ir.Write(OP, rt, ir.AddConstant(uimm)); + ir.Write(OP, rt, rs, ir.AddConstant(uimm)); } } @@ -95,8 +95,7 @@ void IRJit::Comp_IType(MIPSOpcode op) { break; } gpr.MapDirtyIn(rt, rs); - // Grab the sign bit (< 0) as 1/0. Slightly faster than a shift. 
- ir.Write(IROp::Slt, rt, rs, ir.AddConstant(simm)); + ir.Write(IROp::SltConst, rt, rs, ir.AddConstant(simm)); break; case 11: // R(rt) = R(rs) < suimm; break; //sltiu @@ -105,7 +104,7 @@ void IRJit::Comp_IType(MIPSOpcode op) { break; } gpr.MapDirtyIn(rt, rs); - ir.Write(IROp::SltU, rt, rs, ir.AddConstant(suimm)); + ir.Write(IROp::SltUConst, rt, rs, ir.AddConstant(suimm)); break; case 15: // R(rt) = uimm << 16; //lui @@ -167,6 +166,7 @@ void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp co // Luckily, it was just an imm. gpr.SetImm(rhs, rhsImm); } + return; } // Can't do the RSB optimization on ARM64 - no RSB! @@ -220,10 +220,17 @@ void IRJit::Comp_RType3(MIPSOpcode op) { case 39: // R(rd) = ~(R(rs) | R(rt)); break; //nor if (gpr.IsImm(rs) && gpr.IsImm(rt)) { gpr.SetImm(rd, ~(gpr.GetImm(rs) | gpr.GetImm(rt))); - } - - ir.Write(IROp::Or, IRTEMP_0, rs, rt); - ir.Write(IROp::Not, rd, IRTEMP_0); + } else { + gpr.MapDirtyInIn(rd, rs, rt); + if (rs == 0) { + ir.Write(IROp::Not, rd, rt); + } else if (rt == 0) { + ir.Write(IROp::Not, rd, rs); + } else { + ir.Write(IROp::Or, IRTEMP_0, rs, rt); + ir.Write(IROp::Not, rd, IRTEMP_0); + } + } break; case 42: //R(rd) = (int)R(rs) < (int)R(rt); break; //slt @@ -323,9 +330,9 @@ void IRJit::Comp_ShiftType(MIPSOpcode op) { // WARNING : ROTR switch (op & 0x3f) { - case 0: CompShiftImm(op, IROp::Shl, sa); break; //sll - case 2: CompShiftImm(op, rs == 1 ? IROp::Ror : IROp::Shr, sa); break; //srl - case 3: CompShiftImm(op, IROp::Sar, sa); break; //sra + case 0: CompShiftImm(op, IROp::ShlImm, sa); break; //sll + case 2: CompShiftImm(op, (rs == 1 ? IROp::RorImm : IROp::ShrImm), sa); break; //srl + case 3: CompShiftImm(op, IROp::SarImm, sa); break; //sra case 4: CompShiftVar(op, IROp::Shl, IROp::ShlImm); break; //sllv case 6: CompShiftVar(op, (fd == 1 ? IROp::Ror : IROp::Shr), (fd == 1 ? 
IROp::RorImm : IROp::ShrImm)); break; //srlv case 7: CompShiftVar(op, IROp::Sar, IROp::SarImm); break; //srav diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 16c7245b82dc..7d01d0b685da 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -72,22 +72,28 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) MIPSGPReg lhs = rs; MIPSGPReg rhs = rt; - if (!delaySlotIsNice) { - ir.Write(IROp::Mov, IRTEMP_0, rs); - ir.Write(IROp::Mov, IRTEMP_1, rt); - lhs = (MIPSGPReg)IRTEMP_0; - rhs = (MIPSGPReg)IRTEMP_1; + if (!delaySlotIsNice && !likely) { // if likely, we don't need this + if (rs != 0) { + ir.Write(IROp::Mov, IRTEMP_0, rs); + lhs = (MIPSGPReg)IRTEMP_0; + } + if (rt != 0) { + ir.Write(IROp::Mov, IRTEMP_1, rt); + rhs = (MIPSGPReg)IRTEMP_1; + } } if (!likely) CompileDelaySlot(); gpr.MapInIn(lhs, rhs); + FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs, rhs); // This makes the block "impure" :( if (likely) CompileDelaySlot(); + FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); js.compiling = false; @@ -105,19 +111,25 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + if (!likely && delaySlotIsNice) CompileDelaySlot(); int lhs = rs; gpr.MapIn(rs); - if (!delaySlotIsNice) { + if (!delaySlotIsNice && !likely) { // if likely, we don't need this ir.Write(IROp::Mov, IRTEMP_0, rs); lhs = IRTEMP_0; } + if (andLink) + gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); + FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs); if (likely) { CompileDelaySlot(); } // Taken + FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); js.compiling = false; } @@ -173,12 +185,15 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { if (!likely) CompileDelaySlot(); + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + FlushAll(); // Not taken ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), IRTEMP_0, 0); // Taken if (likely) CompileDelaySlot(); + FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); js.compiling = false; } @@ -208,6 +223,8 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { ir.Write(IROp::VfpCondToReg, IRTEMP_0); + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + // Sometimes there's a VFPU branch in a delay slot (Disgaea 2: Dark Hero Days, Zettai Hero Project, La Pucelle) // The behavior is undefined - the CPU may take the second branch even if the first one passes. // However, it does consistently try each branch, which these games seem to expect. @@ -223,12 +240,14 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 notTakenTarget = GetCompilerPC() + (delaySlotIsBranch ? 
4 : 8); ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(imm3)); + FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(notTakenTarget), IRTEMP_0, 0); if (likely) CompileDelaySlot(); // Taken + FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); js.compiling = false; } @@ -251,6 +270,8 @@ void IRJit::Comp_Jump(MIPSOpcode op) { u32 off = _IMM26 << 2; u32 targetAddr = (GetCompilerPC() & 0xF0000000) | off; + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + // Might be a stubbed address or something? if (!Memory::IsValidAddress(targetAddr)) { if (js.nextExit == 0) { @@ -270,8 +291,6 @@ void IRJit::Comp_Jump(MIPSOpcode op) { break; case 3: //jal - if (ReplaceJalTo(targetAddr)) - return; gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); CompileDelaySlot(); FlushAll(); @@ -299,6 +318,8 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { if (andLink && rs == rd) delaySlotIsNice = false; + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int destReg; if (IsSyscall(delaySlotOp)) { gpr.MapDirty(rs); @@ -336,7 +357,7 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { break; } - ir.Write(IROp::ExitToReg, ir.AddConstant(js.downcountAmount), rs, 0); + ir.Write(IROp::ExitToReg, destReg, 0, 0); js.compiling = false; } @@ -354,8 +375,7 @@ void IRJit::Comp_Syscall(MIPSOpcode op) { js.compiling = false; } -void IRJit::Comp_Break(MIPSOpcode op) -{ +void IRJit::Comp_Break(MIPSOpcode op) { Comp_Generic(op); js.compiling = false; } diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 00a8ec63991c..86e8d126e7a0 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -82,7 +82,7 @@ void IRJit::Comp_FPUComp(MIPSOpcode op) { int opc = op & 0xF; if (opc >= 8) opc -= 8; // alias if (opc == 0) { // f, sf (signalling false) - gpr.SetImm(MIPS_REG_FPCOND, 0); + gpr.SetImm((MIPSGPReg)IRREG_FPCOND, 0); return; } diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp index 53ea1f866fe9..fb0a143dd8a6 100644 --- a/Core/MIPS/IR/IRCompLoadStore.cpp +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -82,7 +82,8 @@ namespace MIPSComp { return; } - u32 iaddr = gpr.IsImm(rs) ? 
offset + gpr.GetImm(rs) : 0xFFFFFFFF; + gpr.MapIn(rs); + gpr.MapDirty(rt); int addrReg = IRTEMP_0; switch (o) { // Load diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index cfcbe4349747..1e0cdabf0bb2 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -1,8 +1,13 @@ #include "Core/MIPS/IR/IRInst.h" +#include "Core/MIPS/IR/IRPassSimplify.h" +#include "Core/MIPS/MIPSDebugInterface.h" +#include "Core/MIPS/MIPSTables.h" #include "Core/MemMap.h" +#include "Core/HLE/HLE.h" IRMeta meta[] = { - { IROp::SetConst, "SetConst", "GC" }, + { IROp::SetConst, "SetConst", "GC_" }, + { IROp::Mov, "Mov", "GG" }, { IROp::Add, "Add", "GGG" }, { IROp::Sub, "Sub", "GGG" }, { IROp::Neg, "Neg", "GG" }, @@ -23,9 +28,9 @@ IRMeta meta[] = { { IROp::ShrImm, "ShrImm", "GGI" }, { IROp::SarImm, "SarImm", "GGI" }, { IROp::RorImm, "RorImm", "GGI" }, - { IROp::Slt, "Slt","GGC" }, - { IROp::SltConst, "SltConst","GGC" }, - { IROp::SltU, "SltU", "GGC" }, + { IROp::Slt, "Slt", "GGG" }, + { IROp::SltConst, "SltConst", "GGC" }, + { IROp::SltU, "SltU", "GGG" }, { IROp::SltUConst, "SltUConst", "GGC" }, { IROp::Clz, "Clz", "GG" }, { IROp::MovZ, "MovZ", "GGG" }, @@ -37,6 +42,14 @@ IRMeta meta[] = { { IROp::Mul, "Mul", "_GG" }, { IROp::Ext8to32, "Ext8to32", "GG" }, { IROp::Ext16to32, "Ext16to32", "GG" }, + { IROp::Load8, "Load8", "GGC" }, + { IROp::Load8Ext, "Load8", "GGC" }, + { IROp::Load16, "Load16", "GGC" }, + { IROp::Load16Ext, "Load16Ext", "GGC" }, + { IROp::Load32, "Load32", "GGC" }, + { IROp::Store8, "Store8", "GGC" }, + { IROp::Store16, "Store16", "GGC" }, + { IROp::Store32, "Store32", "GGC" }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, @@ -57,8 +70,16 @@ IRMeta meta[] = { { IROp::SetCtrlVFPU, "SetCtrlVFPU", "T" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, + { IROp::ExitToConst, "Exit", "C" }, + { IROp::ExitToConstIfEq, "ExitIfEq", "CGG" }, + { IROp::ExitToConstIfNeq, "ExitIfNeq", "CGG" }, + { IROp::ExitToConstIfGtZ, "ExitIfGtZ", "CG" }, + { IROp::ExitToConstIfGeZ, "ExitIfGeZ", "CG" }, + { IROp::ExitToConstIfLeZ, "ExitIfLeZ", "CG" }, + { IROp::ExitToConstIfLtZ, "ExitIfLtZ", "CG" }, + { IROp::ExitToReg, "ExitToReg", "G" }, { IROp::Syscall, "Syscall", "_C"}, - { IROp::SetPC, "SetPC", "_C"}, + { IROp::SetPC, "SetPC", "_G"}, }; const IRMeta *metaIndex[256]; @@ -82,9 +103,39 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::Sub: mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2]; break; + case IROp::And: + mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2]; + break; + case IROp::Or: + mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2]; + break; + case IROp::Xor: + mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2]; + break; + case IROp::Mov: + mips->r[inst->dest] = mips->r[inst->src1]; + break; + case IROp::AddConst: + mips->r[inst->dest] = mips->r[inst->src1] + constPool[inst->src2]; + break; + case IROp::SubConst: + mips->r[inst->dest] = mips->r[inst->src1] - constPool[inst->src2]; + break; + case IROp::AndConst: + mips->r[inst->dest] = mips->r[inst->src1] & constPool[inst->src2]; + break; + case IROp::OrConst: + mips->r[inst->dest] = mips->r[inst->src1] | constPool[inst->src2]; + break; + case IROp::XorConst: + mips->r[inst->dest] = mips->r[inst->src1] ^ constPool[inst->src2]; + break; case IROp::Neg: mips->r[inst->dest] = -(s32)mips->r[inst->src1]; break; + case IROp::Not: + mips->r[inst->dest] = 
~mips->r[inst->src1]; + break; case IROp::Ext8to32: mips->r[inst->dest] = (s32)(s8)mips->r[inst->src1]; break; @@ -152,6 +203,22 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c } break; + case IROp::Slt: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2]; + break; + + case IROp::SltU: + mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2]; + break; + + case IROp::SltConst: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)constPool[inst->src2]; + break; + + case IROp::SltUConst: + mips->r[inst->dest] = mips->r[inst->src1] < constPool[inst->src2]; + break; + case IROp::MovZ: if (mips->r[inst->src1] == 0) mips->r[inst->dest] = mips->r[inst->src2]; @@ -208,10 +275,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; case IROp::ExitToConst: - return constPool[inst->src1]; + return constPool[inst->dest]; case IROp::ExitToReg: - return mips->r[inst->src1]; + return mips->r[inst->dest]; case IROp::ExitToConstIfEq: if (mips->r[inst->src1] == mips->r[inst->src2]) @@ -238,8 +305,28 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c return constPool[inst->dest]; break; + case IROp::Downcount: + mips->downcount -= (inst->src1) | ((inst->src2) << 8); + break; + case IROp::SetPC: - return mips->pc = mips->r[inst->src1]; + mips->pc = mips->r[inst->src1]; + break; + + case IROp::Syscall: + // SetPC was executed before. + { + MIPSOpcode op(constPool[inst->src1]); + CallSyscall(op); + return mips->pc; + } + + case IROp::Interpret: // SLOW fallback. Can be made faster. + { + MIPSOpcode op(constPool[inst->src1]); + MIPSInterpret(op); + break; + } default: Crash(); @@ -262,14 +349,13 @@ void IRWriter::Write(IROp op, u8 dst, u8 src1, u8 src2) { } void IRWriter::WriteSetConstant(u8 dst, u32 value) { - // TODO: Check for the fixed ones first. 
- Write(IROp::SetConstImm, AddConstant(value)); + Write(IROp::SetConst, dst, AddConstant(value)); } int IRWriter::AddConstant(u32 value) { for (size_t i = 0; i < constPool_.size(); i++) { if (constPool_[i] == value) - return i; + return (int)i; } constPool_.push_back(value); return (int)constPool_.size() - 1; @@ -281,10 +367,25 @@ int IRWriter::AddConstantFloat(float value) { return AddConstant(val); } +void IRWriter::Simplify() { + SimplifyInPlace(&insts_[0], insts_.size(), constPool_.data()); +} + +const char *GetGPRName(int r) { + if (r < 32) { + return currentDebugMIPS->GetRegName(0, r); + } + switch (r) { + case IRTEMP_0: return "irtemp0"; + case IRTEMP_1: return "irtemp1"; + default: return "(unk)"; + } +} + void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *constPool) { switch (type) { case 'G': - snprintf(buf, bufSize, "r%d", param); + snprintf(buf, bufSize, "%s", GetGPRName(param)); break; case 'F': snprintf(buf, bufSize, "r%d", param); @@ -292,6 +393,13 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'C': snprintf(buf, bufSize, "%08x", constPool[param]); break; + case 'I': + snprintf(buf, bufSize, "%02x", param); + break; + case '_': + case '\0': + buf[0] = 0; + break; default: snprintf(buf, bufSize, "?"); break; @@ -300,17 +408,21 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co void DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool) { const IRMeta *meta = metaIndex[(int)inst.op]; + if (!meta) { + snprintf(buf, bufsize, "Unknown %d", (int)inst.op); + return; + } char bufDst[16]; char bufSrc1[16]; char bufSrc2[16]; DisassembleParam(bufDst, sizeof(bufDst) - 2, inst.dest, meta->types[0], constPool); - DisassembleParam(bufSrc1, sizeof(bufSrc1) - 2, inst.dest, meta->types[1], constPool); - DisassembleParam(bufSrc2, sizeof(bufSrc2), inst.dest, meta->types[2], constPool); - if (meta->types[1]) { + DisassembleParam(bufSrc1, sizeof(bufSrc1) - 2, inst.src1, meta->types[1], constPool); + DisassembleParam(bufSrc2, sizeof(bufSrc2), inst.src2, meta->types[2], constPool); + if (meta->types[1] && meta->types[0] != '_') { strcat(bufDst, ", "); } - if (meta->types[2]) { + if (meta->types[2] && meta->types[1] != '_') { strcat(bufSrc1, ", "); } snprintf(buf, bufsize, "%s %s%s%s", meta->name, bufDst, bufSrc1, bufSrc2); -} \ No newline at end of file +} diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index a1aa75edc7cf..c3cb6021de50 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -17,7 +17,6 @@ enum class IROp : u8 { SetConst, - SetConstImm, FSetConst, Mov, @@ -202,6 +201,8 @@ enum { // Hacky way to get to other state IRREG_LO = 226, // offset of lo in MIPSState / 4 IRREG_HI = 227, + IRREG_FCR31 = 228, + IRREG_FPCOND = 229 }; enum class IRParam { @@ -249,6 +250,8 @@ class IRWriter { constPool_.clear(); } + void Simplify(); + const std::vector &GetInstructions() { return insts_; } const std::vector &GetConstants() { return constPool_; } @@ -258,3 +261,4 @@ class IRWriter { }; void DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool); +void InitIR(); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index b16706b2d68e..7fae3255c34f 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -38,32 +38,18 @@ #include "Core/MIPS/IR/IRJit.h" #include "Core/MIPS/JitCommon/JitCommon.h" -void DisassembleArm64Print(const u8 *data, int size) { - std::vector lines = DisassembleArm64(data, size); - for (auto s : 
lines) { - ILOG("%s", s.c_str()); - } - /* - ILOG("+++"); - // A format friendly to Online Disassembler which gets endianness wrong - for (size_t i = 0; i < lines.size(); i++) { - uint32_t opcode = ((const uint32_t *)data)[i]; - ILOG("%d/%d: %08x", (int)(i+1), (int)lines.size(), swap32(opcode)); - } - ILOG("==="); - ILOG("===");*/ -} - namespace MIPSComp { IRJit::IRJit(MIPSState *mips) : gpr(), mips_(mips) { logBlocks = 0; dontLogBlocks = 0; - js.startDefaultPrefix = mips_->HasDefaultPrefix(); + js.startDefaultPrefix = true; js.currentRoundingFunc = convertS0ToSCRATCH1[0]; u32 size = 128 * 1024; blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); + logBlocks = 100; + InitIR(); } IRJit::~IRJit() { @@ -102,7 +88,8 @@ void IRJit::DoDummyState(PointerWrap &p) { } void IRJit::FlushAll() { - FlushPrefixV(); + gpr.FlushAll(); + // FlushPrefixV(); } void IRJit::FlushPrefixV() { @@ -162,6 +149,7 @@ void IRJit::Compile(u32 em_address) { int block_num = blocks_.AllocateBlock(em_address); IRBlock *b = blocks_.GetBlock(block_num); DoJit(em_address, b); + b->Finalize(block_num); // Overwrites the first instruction bool cleanSlate = false; @@ -192,7 +180,35 @@ void IRJit::Compile(u32 em_address) { void IRJit::RunLoopUntil(u64 globalticks) { PROFILE_THIS_SCOPE("jit"); - ((void (*)())enterDispatcher)(); + + // ApplyRoundingMode(true); + // IR Dispatcher + + while (true) { + // RestoreRoundingMode(true); + CoreTiming::Advance(); + // ApplyRoundingMode(true); + if (coreState != 0) { + break; + } + while (mips_->downcount >= 0) { + u32 inst = Memory::ReadUnchecked_U32(mips_->pc); + u32 opcode = inst >> 24; + u32 data = inst & 0xFFFFFF; + if (opcode == (MIPS_EMUHACK_OPCODE >> 24)) { + IRBlock *block = blocks_.GetBlock(data); + ILOG("Run block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); + mips_->pc = IRInterpret(mips_, block->GetInstructions(), block->GetConstants(), block->GetNumInstructions()); + } else { + // RestoreRoundingMode(true); + ILOG("Compile block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); + Compile(mips_->pc); + // ApplyRoundingMode(true); + } + } + } + + // RestoreRoundingMode(true); } u32 IRJit::GetCompilerPC() { @@ -230,24 +246,28 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { js.numInstructions++; } + ir.Simplify(); + b->SetInstructions(ir.GetInstructions(), ir.GetConstants()); - char temp[256]; if (logBlocks > 0 && dontLogBlocks == 0) { + char temp2[256]; ILOG("=============== mips %d ===============", blocks_.GetNumBlocks()); for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { - MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp, true); - ILOG("M: %08x %s", cpc, temp); + temp2[0] = 0; + MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp2, true); + ILOG("M: %08x %s", cpc, temp2); } } if (logBlocks > 0 && dontLogBlocks == 0) { ILOG("=============== IR (%d instructions) ===============", js.numInstructions); - for (int i = 0; i < js.numInstructions; i++) { + for (int i = 0; i < ir.GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); ILOG("%s", buf); } + ILOG("=============== end ================="); } if (logBlocks > 0) @@ -330,4 +350,15 @@ void IRBlockCache::InvalidateICache(u32 addess, u32 length) { // TODO } +void IRBlock::Finalize(int number) { + origFirstOpcode_= Memory::Read_Opcode_JIT(origAddr_); + MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number); + Memory::Write_Opcode_JIT(origAddr_, 
opcode); +} + +MIPSOpcode IRJit::GetOriginalOp(MIPSOpcode op) { + IRBlock *b = blocks_.GetBlock(op.encoding & 0xFFFFFF); + return b->GetOriginalFirstOp(); +} + } // namespace MIPSComp \ No newline at end of file diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 686eefe6c274..440e96d28286 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -34,8 +34,18 @@ namespace MIPSComp { // TODO : Use arena allocators. For now let's just malloc. class IRBlock { public: - IRBlock() {} + IRBlock() : instr_(nullptr), const_(nullptr), numInstructions_(0), numConstants_(0), origAddr_(0) {} IRBlock(u32 emAddr) : instr_(nullptr), const_(nullptr), origAddr_(emAddr), numInstructions_(0) {} + IRBlock(IRBlock &&b) { + instr_ = b.instr_; + const_ = b.const_; + numInstructions_ = b.numInstructions_; + numConstants_ = b.numConstants_; + origAddr_ = b.origAddr_; + b.instr_ = nullptr; + b.const_ = nullptr; + } + ~IRBlock() { delete[] instr_; delete[] const_; @@ -50,12 +60,20 @@ class IRBlock { memcpy(const_, constants.data(), sizeof(u32) * constants.size()); } + const IRInst *GetInstructions() const { return instr_; } + const u32 *GetConstants() const { return const_; } + int GetNumInstructions() const { return numInstructions_; } + MIPSOpcode GetOriginalFirstOp() const { return origFirstOpcode_; } + + void Finalize(int number); + private: IRInst *instr_; u32 *const_; u16 numInstructions_; u16 numConstants_; u32 origAddr_; + MIPSOpcode origFirstOpcode_; }; class IRBlockCache { @@ -170,7 +188,8 @@ class IRJit : public JitInterface { int Replace_fabsf(); // Not using a regular block cache. - JitBlockCache *GetBlockCache() { return nullptr; } + JitBlockCache *GetBlockCache() override { return nullptr; } + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; void ClearCache(); void InvalidateCache(); diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp new file mode 100644 index 000000000000..e110b7380874 --- /dev/null +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -0,0 +1,14 @@ +#include "Core/MIPS/IR/IRPassSimplify.h" + +void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool) { + for (int i = 0; i < count; i++) { + switch (inst[i].op) { + case IROp::AddConst: + if (constPool[inst[i].src2] == 0) + inst[i].op = IROp::Mov; + break; + default: + break; + } + } +} \ No newline at end of file diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h new file mode 100644 index 000000000000..c798d89f92b0 --- /dev/null +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -0,0 +1,5 @@ +#pragma once + +#include "Core/MIPS/IR/IRInst.h" + +void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index 7a31a463e4e5..808370ce6321 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -42,5 +42,7 @@ void IRRegCache::Start(IRWriter *ir) { } void IRRegCache::FlushAll() { - + for (int i = 0; i < TOTAL_MAPPABLE_MIPSREGS; i++) { + Dirty((MIPSGPReg)i); + } } diff --git a/Core/MIPS/JitCommon/JitCommon.h b/Core/MIPS/JitCommon/JitCommon.h index 9c440a6cdbcb..e27707ea3558 100644 --- a/Core/MIPS/JitCommon/JitCommon.h +++ b/Core/MIPS/JitCommon/JitCommon.h @@ -57,6 +57,7 @@ namespace MIPSComp { virtual void Compile(u32 em_address) = 0; virtual void ClearCache() = 0; virtual void EatPrefix() = 0; + virtual MIPSOpcode GetOriginalOp(MIPSOpcode op) = 0; // Block linking. This may need to work differently for whole-function JITs and stuff // like that. 
diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index 3ada3ad123e3..c2c01a56f4d4 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -839,4 +839,14 @@ void Jit::CallProtectedFunction(const void *func, const OpArg &arg1, const u32 a void Jit::Comp_DoNothing(MIPSOpcode op) { } +MIPSOpcode Jit::GetOriginalOp(MIPSOpcode op) { + JitBlockCache *bc = GetBlockCache(); + int block_num = bc->GetBlockNumberFromEmuHackOp(op, true); + if (block_num >= 0) { + return bc->GetOriginalFirstOp(block_num); + } else { + return op; + } +} + } // namespace diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index 68d160a9589f..a6f44443311a 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -161,6 +161,7 @@ class Jit : public Gen::XCodeBlock, public JitInterface { void UpdateRoundingMode(); JitBlockCache *GetBlockCache() { return &blocks; } + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; void ClearCache(); void InvalidateCache() override; diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp index 557bc461c4a5..68e1f3e5a84f 100644 --- a/Core/MemMap.cpp +++ b/Core/MemMap.cpp @@ -479,13 +479,7 @@ Opcode Read_Opcode_JIT(u32 address) { Opcode inst = Opcode(Read_U32(address)); if (MIPS_IS_RUNBLOCK(inst.encoding) && MIPSComp::jit) { - JitBlockCache *bc = MIPSComp::jit->GetBlockCache(); - int block_num = bc->GetBlockNumberFromEmuHackOp(inst, true); - if (block_num >= 0) { - return bc->GetOriginalFirstOp(block_num); - } else { - return inst; - } + return MIPSComp::jit->GetOriginalOp(inst); } else { return inst; } From 12edfcea5aad7ff0509229154a2bfcb1d080ea5f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 21:00:30 +0200 Subject: [PATCH 03/77] Enough to run cpu_alu.prx. --- Core/MIPS/IR/IRCompALU.cpp | 56 +++++++++++++++++++++++++++++-- Core/MIPS/IR/IRCompBranch.cpp | 22 +++++++----- Core/MIPS/IR/IRInst.cpp | 19 +++++++++-- Core/MIPS/IR/IRJit.cpp | 26 ++++++++++---- Core/MIPS/IR/IRJit.h | 7 +++- Core/MIPS/JitCommon/JitCommon.cpp | 18 +++++----- Core/MIPS/MIPSTables.cpp | 14 ++++++++ Core/MIPS/x86/Asm.cpp | 2 +- Core/MIPS/x86/Jit.cpp | 3 +- 9 files changed, 133 insertions(+), 34 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 69cf25de5604..7c360082c9ed 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -153,7 +153,7 @@ void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp co } return; } - + /* if (gpr.IsImm(rt) || (gpr.IsImm(rs) && symmetric)) { MIPSGPReg lhs = gpr.IsImm(rs) ? rt : rs; MIPSGPReg rhs = gpr.IsImm(rs) ? rs : rt; @@ -167,7 +167,7 @@ void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp co gpr.SetImm(rhs, rhsImm); } return; - } + }*/ // Can't do the RSB optimization on ARM64 - no RSB! @@ -343,7 +343,57 @@ void IRJit::Comp_ShiftType(MIPSOpcode op) { } void IRJit::Comp_Special3(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + MIPSGPReg rs = _RS; + MIPSGPReg rt = _RT; + + int pos = _POS; + int size = _SIZE + 1; + u32 mask = 0xFFFFFFFFUL >> (32 - size); + + // Don't change $zr. 
+ if (rt == 0) + return; + + switch (op & 0x3f) { + case 0x0: //ext + if (gpr.IsImm(rs)) { + gpr.SetImm(rt, (gpr.GetImm(rs) >> pos) & mask); + return; + } + + gpr.MapDirtyIn(rt, rs); + ir.Write(IROp::Shl, rt, rs); + ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(mask)); + break; + + case 0x4: //ins + { + u32 sourcemask = mask >> pos; + u32 destmask = ~(sourcemask << pos); + if (gpr.IsImm(rs)) { + u32 inserted = (gpr.GetImm(rs) & sourcemask) << pos; + if (gpr.IsImm(rt)) { + gpr.SetImm(rt, (gpr.GetImm(rt) & destmask) | inserted); + return; + } + + gpr.MapDirty(rt); + ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); + if (inserted != 0) { + ir.Write(IROp::OrConst, rt, rt, inserted); + } + } else { + gpr.MapDirtyIn(rt, rs); + ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(sourcemask)); + ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); + ir.Write(IROp::ShlImm, IRTEMP_0, IRTEMP_0, pos); + ir.Write(IROp::Or, rt, rt, IRTEMP_0); + } + } + break; + } } void IRJit::Comp_Allegrex(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 7d01d0b685da..27fb5ae52d4e 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -72,12 +72,14 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) MIPSGPReg lhs = rs; MIPSGPReg rhs = rt; - if (!delaySlotIsNice && !likely) { // if likely, we don't need this + if (!delaySlotIsNice) { // if likely, we don't need this if (rs != 0) { + gpr.MapIn(rs); ir.Write(IROp::Mov, IRTEMP_0, rs); lhs = (MIPSGPReg)IRTEMP_0; } if (rt != 0) { + gpr.MapIn(rt); ir.Write(IROp::Mov, IRTEMP_1, rt); rhs = (MIPSGPReg)IRTEMP_1; } @@ -113,21 +115,22 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); - if (!likely && delaySlotIsNice) - CompileDelaySlot(); - int lhs = rs; - gpr.MapIn(rs); - if (!delaySlotIsNice && !likely) { // if likely, we don't need this + MIPSGPReg lhs = rs; + if (!delaySlotIsNice) { // if likely, we don't need this ir.Write(IROp::Mov, IRTEMP_0, rs); - lhs = IRTEMP_0; + lhs = (MIPSGPReg)IRTEMP_0; } if (andLink) gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); + + if (!likely) + CompileDelaySlot(); + + gpr.MapIn(lhs); FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs); - if (likely) { + if (likely) CompileDelaySlot(); - } // Taken FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); @@ -327,6 +330,7 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { if (andLink) gpr.SetImm(rd, GetCompilerPC() + 8); CompileDelaySlot(); + // Syscall (the delay slot) does FlushAll. return; // Syscall (delay slot) wrote exit code. 
} else if (delaySlotIsNice) { if (andLink) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 1e0cdabf0bb2..fc4a07a2ec2d 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -170,13 +170,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; case IROp::ShlImm: - mips->r[inst->dest] = mips->r[inst->src1] << inst->src2; + mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; break; case IROp::ShrImm: - mips->r[inst->dest] = mips->r[inst->src1] >> inst->src2; + mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2; break; case IROp::SarImm: - mips->r[inst->dest] = (s32)mips->r[inst->src1] >> inst->src2; + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2; break; case IROp::RorImm: { @@ -203,6 +203,19 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c } break; + case IROp::Clz: + { + int x = 31; + int count = 0; + int value = mips->r[inst->src1]; + while (x >= 0 && !(value & (1 << x))) { + count++; + x--; + } + mips->r[inst->dest] = count; + break; + } + case IROp::Slt: mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2]; break; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 7fae3255c34f..d99cebfc3ec8 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -48,7 +48,7 @@ IRJit::IRJit(MIPSState *mips) : gpr(), mips_(mips) { js.currentRoundingFunc = convertS0ToSCRATCH1[0]; u32 size = 128 * 1024; blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); - logBlocks = 100; + logBlocks = 0; InitIR(); } @@ -184,6 +184,12 @@ void IRJit::RunLoopUntil(u64 globalticks) { // ApplyRoundingMode(true); // IR Dispatcher + FILE *f; + int numBlocks = 0; + if (numBlocks) { + f = fopen("E:\\blockir.txt", "w"); + } + while (true) { // RestoreRoundingMode(true); CoreTiming::Advance(); @@ -197,11 +203,18 @@ void IRJit::RunLoopUntil(u64 globalticks) { u32 data = inst & 0xFFFFFF; if (opcode == (MIPS_EMUHACK_OPCODE >> 24)) { IRBlock *block = blocks_.GetBlock(data); - ILOG("Run block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); + if (numBlocks > 0) { + // ILOG("Run block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); + fprintf(f, "BLOCK : %08x v0: %08x v1: %08x a0: %08x s0: %08x s4: %08x\n", mips_->pc, mips_->r[MIPS_REG_V0], mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0], mips_->r[MIPS_REG_S0], mips_->r[MIPS_REG_S4]); + fflush(f); + numBlocks--; + } mips_->pc = IRInterpret(mips_, block->GetInstructions(), block->GetConstants(), block->GetNumInstructions()); } else { + if (mips_->pc == 0x0880de94) + logBlocks = 10; // RestoreRoundingMode(true); - ILOG("Compile block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); + // ILOG("Compile block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); Compile(mips_->pc); // ApplyRoundingMode(true); } @@ -252,7 +265,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { if (logBlocks > 0 && dontLogBlocks == 0) { char temp2[256]; - ILOG("=============== mips %d ===============", blocks_.GetNumBlocks()); + ILOG("=============== mips %d %08x ===============", blocks_.GetNumBlocks(), em_address); for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { temp2[0] = 0; MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp2, true); @@ -304,7 +317,8 @@ void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { } void IRJit::Comp_Generic(MIPSOpcode op) { - 
ir.Write(IROp::Interpret, ir.AddConstant(op.encoding)); + FlushAll(); + ir.Write(IROp::Interpret, 0, ir.AddConstant(op.encoding)); const MIPSInfo info = MIPSGetInfo(op); if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) { // If it does eat them, it'll happen in MIPSCompileOp(). @@ -351,7 +365,7 @@ void IRBlockCache::InvalidateICache(u32 addess, u32 length) { } void IRBlock::Finalize(int number) { - origFirstOpcode_= Memory::Read_Opcode_JIT(origAddr_); + origFirstOpcode_ = Memory::Read_Opcode_JIT(origAddr_); MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number); Memory::Write_Opcode_JIT(origAddr_, opcode); } diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 440e96d28286..63badb5ce2cf 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -42,6 +42,7 @@ class IRBlock { numInstructions_ = b.numInstructions_; numConstants_ = b.numConstants_; origAddr_ = b.origAddr_; + origFirstOpcode_ = b.origFirstOpcode_; b.instr_ = nullptr; b.const_ = nullptr; } @@ -86,7 +87,11 @@ class IRBlockCache { return (int)blocks_.size() - 1; } IRBlock *GetBlock(int i) { - return &blocks_[i]; + if (i >= 0 && i < blocks_.size()) { + return &blocks_[i]; + } else { + return nullptr; + } } private: std::vector blocks_; diff --git a/Core/MIPS/JitCommon/JitCommon.cpp b/Core/MIPS/JitCommon/JitCommon.cpp index 0b7fc1932244..630494f97329 100644 --- a/Core/MIPS/JitCommon/JitCommon.cpp +++ b/Core/MIPS/JitCommon/JitCommon.cpp @@ -47,21 +47,21 @@ namespace MIPSComp { } JitInterface *CreateNativeJit(MIPSState *mips) { - if (false && g_Config.iCpuCore == (int)CPUCore::CPU_JIT) { +#if 1 + return new MIPSComp::IRJit(mips); +#else #if defined(ARM) - return new MIPSComp::ArmJit(mips); + return new MIPSComp::ArmJit(mips); #elif defined(ARM64) - return new MIPSComp::IRJit(mips); + return new MIPSComp::IRJit(mips); #elif defined(_M_IX86) || defined(_M_X64) - return new MIPSComp::Jit(mips); + return new MIPSComp::Jit(mips); #elif defined(MIPS) - return new MIPSComp::MipsJit(mips); + return new MIPSComp::MipsJit(mips); #else - return new MIPSComp::FakeJit(mips); + return new MIPSComp::FakeJit(mips); +#endif #endif - } else if (true || g_Config.iCpuCore == (int)CPUCore::CPU_IRJIT) { - return new MIPSComp::IRJit(mips); - } } } diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp index 39038df6ae9e..c5ef44bc0d04 100644 --- a/Core/MIPS/MIPSTables.cpp +++ b/Core/MIPS/MIPSTables.cpp @@ -28,6 +28,7 @@ #include "Core/CoreTiming.h" #include "Core/Reporting.h" #include "Core/Debugger/Breakpoints.h" +#include "base/logging.h" #include "JitCommon/JitCommon.h" @@ -973,10 +974,13 @@ void MIPSInterpret(MIPSOpcode op) { int MIPSInterpret_RunUntil(u64 globalTicks) { + int blockCount = 150000; + FILE *f = fopen("E:\\blockjit.txt", "w"); MIPSState *curMips = currentMIPS; while (coreState == CORE_RUNNING) { CoreTiming::Advance(); + u32 lastPC = 0; // NEVER stop in a delay slot! 
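The lastPC bookkeeping and the fprintf added in this hunk (below) mirror the temporary trace written from the IR dispatcher in IRJit.cpp above: both sides log one "BLOCK : pc v0 v1 a0 s0 s4" line in the same format, with the interpreter approximating block boundaries by logging whenever the PC does not simply advance by 4. The two files can then be compared directly (for example with a plain diff of blockir.txt against blockjit.txt) to find the first block at which the IR interpreter's register state diverges from the reference interpreter.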
while (curMips->downcount >= 0 && coreState == CORE_RUNNING) @@ -1015,6 +1019,16 @@ int MIPSInterpret_RunUntil(u64 globalTicks) bool wasInDelaySlot = curMips->inDelaySlot; + if (curMips->pc != lastPC + 4) { + if (blockCount > 0) { + MIPSState *mips_ = curMips; + fprintf(f, "BLOCK : %08x v0: %08x v1: %08x a0: %08x s0: %08x s4: %08x\n", mips_->pc, mips_->r[MIPS_REG_V0], mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0], mips_->r[MIPS_REG_S0], mips_->r[MIPS_REG_S4]); + fflush(f); + blockCount--; + } + } + lastPC = curMips->pc; + MIPSInterpret(op); if (curMips->inDelaySlot) diff --git a/Core/MIPS/x86/Asm.cpp b/Core/MIPS/x86/Asm.cpp index 6d03ba5f6fbc..86dfc1d7fb1f 100644 --- a/Core/MIPS/x86/Asm.cpp +++ b/Core/MIPS/x86/Asm.cpp @@ -40,7 +40,7 @@ namespace MIPSComp //TODO - make an option //#if _DEBUG -static bool enableDebug = false; + static bool enableDebug = true; //#else // bool enableDebug = false; diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index c2c01a56f4d4..4bfce6814e85 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -81,8 +81,7 @@ u32 JitBreakpoint() host->SetDebugMode(true); // There's probably a better place for this. - if (USE_JIT_MISSMAP) - { + if (USE_JIT_MISSMAP) { std::map notJitSorted; std::transform(notJitOps.begin(), notJitOps.end(), std::inserter(notJitSorted, notJitSorted.begin()), flip_pair); From 38b7d89dfbbf49a036a1ee4536e52085bd4265e0 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 21:34:27 +0200 Subject: [PATCH 04/77] Fix a performance issue in CallSyscall --- Core/HLE/HLE.cpp | 7 ++++--- Core/HLE/ReplaceTables.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Core/HLE/HLE.cpp b/Core/HLE/HLE.cpp index a4a448f574d7..2083b1ba24e8 100644 --- a/Core/HLE/HLE.cpp +++ b/Core/HLE/HLE.cpp @@ -70,6 +70,7 @@ static int delayedResultEvent = -1; static int hleAfterSyscall = HLE_AFTER_NOTHING; static const char *hleAfterSyscallReschedReason; static const HLEFunction *latestSyscall = nullptr; +static int idleOp; void hleDelayResultFinish(u64 userdata, int cycleslate) { @@ -93,6 +94,7 @@ void HLEInit() { RegisterAllModules(); delayedResultEvent = CoreTiming::RegisterEvent("HLEDelayedResult", hleDelayResultFinish); + idleOp = GetSyscallOp("FakeSysCalls", NID_IDLE); } void HLEDoState(PointerWrap &p) @@ -540,9 +542,8 @@ void CallSyscall(MIPSOpcode op) return; } - if (info->func) - { - if (op == GetSyscallOp("FakeSysCalls", NID_IDLE)) + if (info->func) { + if (op == idleOp) info->func(); else if (info->flags != 0) CallSyscallWithFlags(info); diff --git a/Core/HLE/ReplaceTables.h b/Core/HLE/ReplaceTables.h index 84f85b7eea35..94ee26d69ea1 100644 --- a/Core/HLE/ReplaceTables.h +++ b/Core/HLE/ReplaceTables.h @@ -33,6 +33,8 @@ #pragma once +#include + #include "Common/CommonTypes.h" #include "Core/MIPS/JitCommon/JitCommon.h" From 3c5510e5a36be46800dcdb7a8417f90007231368 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 21:35:12 +0200 Subject: [PATCH 05/77] Disable debug file logging, fix issue with replacement functions, etc --- Core/MIPS/IR/IRCompFPU.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 64 ++++++++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRJit.cpp | 28 ++++++++++++++++- Core/MIPS/MIPSTables.cpp | 5 ++- Core/MemMap.cpp | 28 +++++++---------- 6 files changed, 107 insertions(+), 21 deletions(-) diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 86e8d126e7a0..8d9d7b2aa5f2 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -77,7 
+77,7 @@ void IRJit::Comp_FPULS(MIPSOpcode op) { } void IRJit::Comp_FPUComp(MIPSOpcode op) { - CONDITIONAL_DISABLE; + DISABLE; int opc = op & 0xF; if (opc >= 8) opc -= 8; // alias diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index fc4a07a2ec2d..4876ce697da1 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -4,6 +4,9 @@ #include "Core/MIPS/MIPSTables.h" #include "Core/MemMap.h" #include "Core/HLE/HLE.h" +#include "Core/HLE/ReplaceTables.h" + +#include "math/math_util.h" IRMeta meta[] = { { IROp::SetConst, "SetConst", "GC_" }, @@ -286,6 +289,58 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FpCondToReg: mips->r[inst->dest] = mips->fpcond; break; + case IROp::FRound: + mips->r[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); + break; + case IROp::FTrunc: + { + float src = mips->f[inst->src1]; + if (src >= 0.0f) { + mips->fs[inst->dest] = (int)floorf(src); + // Overflow, but it was positive. + if (mips->fs[inst->dest] == -2147483648LL) { + mips->fs[inst->dest] = 2147483647LL; + } + } else { + // Overflow happens to be the right value anyway. + mips->fs[inst->dest] = (int)ceilf(src); + } + break; + } + case IROp::FCeil: + mips->r[inst->dest] = (int)ceilf(mips->f[inst->src1]); + break; + case IROp::FFloor: + mips->r[inst->dest] = (int)floorf(mips->f[inst->src1]); + break; + + case IROp::FCvtSW: + mips->f[inst->dest] = (float)mips->fs[inst->src1]; + break; + case IROp::FCvtWS: + { + float src = mips->f[inst->src1]; + if (my_isnanorinf(src)) + { + mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? -2147483648LL : 2147483647LL; + break; + } + switch (mips->fcr31 & 3) + { + case 0: mips->fs[inst->dest] = (int)round_ieee_754(src); break; // RINT_0 + case 1: mips->fs[inst->dest] = (int)src; break; // CAST_1 + case 2: mips->fs[inst->dest] = (int)ceilf(src); break; // CEIL_2 + case 3: mips->fs[inst->dest] = (int)floorf(src); break; // FLOOR_3 + } + break; //cvt.w.s + } + + case IROp::FMovFromGPR: + memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4); + break; + case IROp::FMovToGPR: + memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); + break; case IROp::ExitToConst: return constPool[inst->dest]; @@ -341,6 +396,15 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } + case IROp::CallReplacement: + { + int funcIndex = constPool[inst->src1]; + const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); + int cycles = f->replaceFunc(); + mips->downcount -= cycles; + break; + } + default: Crash(); } diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index c3cb6021de50..988601993c7e 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -152,6 +152,7 @@ enum class IROp : u8 { Syscall, SetPC, // hack to make syscall returns work + CallReplacement, Break, }; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index d99cebfc3ec8..6e1317cdc8f3 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -313,7 +313,33 @@ bool IRJit::ReplaceJalTo(u32 dest) { } void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { - Crash(); + int index = op.encoding & MIPS_EMUHACK_VALUE_MASK; + + const ReplacementTableEntry *entry = GetReplacementFunc(index); + if (!entry) { + ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding); + return; + } + + if (entry->flags & REPFLAG_DISABLED) { + MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); + } else if (entry->replaceFunc) { + FlushAll(); + RestoreRoundingMode(); + ir.Write(IROp::SetPC, 
0, ir.AddConstant(GetCompilerPC())); + ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index)); + + if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { + // Compile the original instruction at this address. We ignore cycles for hooks. + ApplyRoundingMode(); + MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); + } else { + ApplyRoundingMode(); + js.compiling = false; + } + } else { + ERROR_LOG(HLE, "Replacement function %s has neither jit nor regular impl", entry->name); + } } void IRJit::Comp_Generic(MIPSOpcode op) { diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp index c5ef44bc0d04..f0b51db0c0df 100644 --- a/Core/MIPS/MIPSTables.cpp +++ b/Core/MIPS/MIPSTables.cpp @@ -974,8 +974,6 @@ void MIPSInterpret(MIPSOpcode op) { int MIPSInterpret_RunUntil(u64 globalTicks) { - int blockCount = 150000; - FILE *f = fopen("E:\\blockjit.txt", "w"); MIPSState *curMips = currentMIPS; while (coreState == CORE_RUNNING) { @@ -1019,6 +1017,7 @@ int MIPSInterpret_RunUntil(u64 globalTicks) bool wasInDelaySlot = curMips->inDelaySlot; + /* if (curMips->pc != lastPC + 4) { if (blockCount > 0) { MIPSState *mips_ = curMips; @@ -1028,7 +1027,7 @@ int MIPSInterpret_RunUntil(u64 globalTicks) } } lastPC = curMips->pc; - + */ MIPSInterpret(op); if (curMips->inDelaySlot) diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp index 68e1f3e5a84f..0d3b3817ccdc 100644 --- a/Core/MemMap.cpp +++ b/Core/MemMap.cpp @@ -426,26 +426,22 @@ __forceinline static Opcode Read_Instruction(u32 address, bool resolveReplacemen if (MIPS_IS_RUNBLOCK(inst.encoding) && MIPSComp::jit) { JitBlockCache *bc = MIPSComp::jit->GetBlockCache(); - int block_num = bc->GetBlockNumberFromEmuHackOp(inst, true); - if (block_num >= 0) { - inst = bc->GetOriginalFirstOp(block_num); - if (resolveReplacements && MIPS_IS_REPLACEMENT(inst)) { - u32 op; - if (GetReplacedOpAt(address, &op)) { - if (MIPS_IS_EMUHACK(op)) { - ERROR_LOG(HLE,"WTF 1"); - return Opcode(op); - } else { - return Opcode(op); - } + + inst = MIPSComp::jit->GetOriginalOp(inst); + if (resolveReplacements && MIPS_IS_REPLACEMENT(inst)) { + u32 op; + if (GetReplacedOpAt(address, &op)) { + if (MIPS_IS_EMUHACK(op)) { + ERROR_LOG(HLE,"WTF 1"); + return Opcode(op); } else { - ERROR_LOG(HLE, "Replacement, but no replacement op? %08x", inst.encoding); + return Opcode(op); } + } else { + ERROR_LOG(HLE, "Replacement, but no replacement op? %08x", inst.encoding); } - return inst; - } else { - return inst; } + return inst; } else if (resolveReplacements && MIPS_IS_REPLACEMENT(inst.encoding)) { u32 op; if (GetReplacedOpAt(address, &op)) { From 750d520cc7361e943636a4128274ccdade0fac60 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 22:27:58 +0200 Subject: [PATCH 06/77] Initial work on mult and friends. Fix another bug. --- Core/MIPS/IR/IRCompALU.cpp | 85 +++++++++++++++++++++++++++-------- Core/MIPS/IR/IRCompBranch.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 60 ++++++++++++++++++++++--- Core/MIPS/IR/IRInst.h | 13 ++++-- Core/MIPS/IR/IRJit.cpp | 3 +- Core/MIPS/IR/IRPassSimplify.h | 1 + 6 files changed, 133 insertions(+), 31 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 7c360082c9ed..7a55c9f8c3d4 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -153,21 +153,6 @@ void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp co } return; } - /* - if (gpr.IsImm(rt) || (gpr.IsImm(rs) && symmetric)) { - MIPSGPReg lhs = gpr.IsImm(rs) ? rt : rs; - MIPSGPReg rhs = gpr.IsImm(rs) ? 
rs : rt; - u32 rhsImm = gpr.GetImm(rhs); - gpr.MapDirtyIn(rd, lhs); - ir.Write(constOp, rd, lhs, ir.AddConstant(rhsImm)); - // If rd is rhs, we may have lost it in the MapDirtyIn(). lhs was kept. - // This means the rhsImm value was never flushed to rhs, and would be garbage. - if (rd == rhs) { - // Luckily, it was just an imm. - gpr.SetImm(rhs, rhsImm); - } - return; - }*/ // Can't do the RSB optimization on ARM64 - no RSB! @@ -463,15 +448,77 @@ void IRJit::Comp_Allegrex2(MIPSOpcode op) { void IRJit::Comp_MulDivType(MIPSOpcode op) { CONDITIONAL_DISABLE; + DISABLE; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; MIPSGPReg rd = _RD; - // Note that in all cases below, LO is actually mapped to HI:LO. - // That is, the host reg is 64 bits and has HI at the top. - // HI is not mappable. + switch (op & 63) { + case 16: // R(rd) = HI; //mfhi + if (rd != MIPS_REG_ZERO) { + gpr.MapDirty(rd); + ir.Write(IROp::MfHi, rd); + } + break; + + case 17: // HI = R(rs); //mthi + gpr.MapIn(rs); + ir.Write(IROp::MtHi, 0, rs); + break; - DISABLE; + case 18: // R(rd) = LO; break; //mflo + if (rd != MIPS_REG_ZERO) { + gpr.MapDirty(rd); + ir.Write(IROp::MfLo, rd); + } + break; + + case 19: // LO = R(rs); break; //mtlo + gpr.MapIn(rs); + ir.Write(IROp::MtLo, 0, rs); + break; + + case 24: //mult (the most popular one). lo,hi = signed mul (rs * rt) + ir.Write(IROp::Mult, 0, rs, rt); + break; + + case 25: //multu (2nd) lo,hi = unsigned mul (rs * rt) + ir.Write(IROp::MultU, 0, rs, rt); + break; + + case 26: //div + DISABLE; + ir.Write(IROp::Div, 0, rs, rt); + break; + + case 27: //divu + DISABLE; + ir.Write(IROp::DivU, 0, rs, rt); + break; + + case 28: //madd + DISABLE; + ir.Write(IROp::Madd, 0, rs, rt); + break; + + case 29: //maddu + DISABLE; + ir.Write(IROp::MaddU, 0, rs, rt); + break; + + case 46: // msub + DISABLE; + ir.Write(IROp::Msub, 0, rs, rt); + break; + + case 47: // msubu + DISABLE; + ir.Write(IROp::MsubU, 0, rs, rt); + break; + + default: + DISABLE; + } } } diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 27fb5ae52d4e..7a4601838f4a 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -380,7 +380,7 @@ void IRJit::Comp_Syscall(MIPSOpcode op) { } void IRJit::Comp_Break(MIPSOpcode op) { - Comp_Generic(op); + ir.Write(IROp::Break); js.compiling = false; } diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 4876ce697da1..460bca7c9052 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -8,7 +8,7 @@ #include "math/math_util.h" -IRMeta meta[] = { +static const IRMeta irMeta[] = { { IROp::SetConst, "SetConst", "GC_" }, { IROp::Mov, "Mov", "GG" }, { IROp::Add, "Add", "GGG" }, @@ -42,7 +42,18 @@ IRMeta meta[] = { { IROp::Min, "Min", "GGG" }, { IROp::BSwap16, "BSwap16", "GG" }, { IROp::BSwap32, "BSwap32", "GG" }, - { IROp::Mul, "Mul", "_GG" }, + { IROp::Mult, "Mult", "_GG" }, + { IROp::MultU, "MultU", "_GG" }, + { IROp::Madd, "Madd", "_GG" }, + { IROp::MaddU, "MaddU", "_GG" }, + { IROp::Msub, "Msub", "_GG" }, + { IROp::MsubU, "MsubU", "_GG" }, + { IROp::Div, "Div", "_GG" }, + { IROp::DivU, "DivU", "_GG" }, + { IROp::MtLo, "MtLo", "_G" }, + { IROp::MtHi, "MtHi", "_G" }, + { IROp::MfLo, "MfLo", "G" }, + { IROp::MfHi, "MfHi", "G" }, { IROp::Ext8to32, "Ext8to32", "GG" }, { IROp::Ext16to32, "Ext16to32", "GG" }, { IROp::Load8, "Load8", "GGC" }, @@ -81,15 +92,16 @@ IRMeta meta[] = { { IROp::ExitToConstIfLeZ, "ExitIfLeZ", "CG" }, { IROp::ExitToConstIfLtZ, "ExitIfLtZ", "CG" }, { IROp::ExitToReg, "ExitToReg", "G" }, - { IROp::Syscall, 
"Syscall", "_C"}, + { IROp::Syscall, "Syscall", "_C" }, + { IROp::Break, "Break", ""}, { IROp::SetPC, "SetPC", "_G"}, }; const IRMeta *metaIndex[256]; void InitIR() { - for (size_t i = 0; i < ARRAY_SIZE(meta); i++) { - metaIndex[(int)meta[i].op] = &meta[i]; + for (size_t i = 0; i < ARRAY_SIZE(irMeta); i++) { + metaIndex[(int)irMeta[i].op] = &irMeta[i]; } } @@ -251,6 +263,32 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2]; break; + case IROp::MtLo: + mips->lo = mips->r[inst->src1]; + break; + case IROp::MtHi: + mips->hi = mips->r[inst->src1]; + break; + case IROp::MfLo: + mips->r[inst->dest] = mips->lo; + break; + case IROp::MfHi: + mips->r[inst->dest] = mips->hi; + break; + + case IROp::Mult: + { + s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + case IROp::MultU: + { + u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + case IROp::BSwap16: { u32 x = mips->r[inst->src1]; @@ -381,6 +419,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c mips->pc = mips->r[inst->src1]; break; + case IROp::SetPCConst: + mips->pc = constPool[inst->src1]; + break; + case IROp::Syscall: // SetPC was executed before. { @@ -402,9 +444,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); int cycles = f->replaceFunc(); mips->downcount -= cycles; - break; + return mips->r[MIPS_REG_RA]; } + case IROp::Break: + Crash(); + break; + default: Crash(); } @@ -412,7 +458,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c } // If we got here, the block was badly constructed. - // Crash(); + Crash(); return 0; } diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 988601993c7e..d3fecabe69b1 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -66,13 +66,19 @@ enum class IROp : u8 { BSwap16, // Swaps both the high and low byte pairs. BSwap32, - // Hi/Lo semantics preserved. - Mul, - MulU, + // Weird Hi/Lo semantics preserved. Too annoying to do something more generic. + MtLo, + MtHi, + MfLo, + MfHi, + Mult, + MultU, Madd, MaddU, Msub, MsubU, + Div, + DivU, // These take a constant from the pool as an offset. // Loads from a constant address can be represented by using r0. 
@@ -152,6 +158,7 @@ enum class IROp : u8 { Syscall, SetPC, // hack to make syscall returns work + SetPCConst, // hack to make replacement know PC CallReplacement, Break, }; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 6e1317cdc8f3..2f88b7740f1c 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -273,6 +273,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } } + if (logBlocks > 0 && dontLogBlocks == 0) { ILOG("=============== IR (%d instructions) ===============", js.numInstructions); for (int i = 0; i < ir.GetInstructions().size(); i++) { @@ -326,7 +327,7 @@ void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { } else if (entry->replaceFunc) { FlushAll(); RestoreRoundingMode(); - ir.Write(IROp::SetPC, 0, ir.AddConstant(GetCompilerPC())); + ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC())); ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index)); if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index c798d89f92b0..8706661704b6 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -2,4 +2,5 @@ #include "Core/MIPS/IR/IRInst.h" +// Dumb example of a simplification pass that can't add or remove instructions. void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); From a33f8b68c62bd483d8a8c018398d9be9262313d3 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 7 May 2016 23:12:53 +0200 Subject: [PATCH 07/77] ir-jit: Get rid of the regcache. Should be replaced with optimization passes. --- Core/MIPS/IR/IRCompALU.cpp | 197 ++++--------------------------- Core/MIPS/IR/IRCompBranch.cpp | 18 +-- Core/MIPS/IR/IRCompFPU.cpp | 11 +- Core/MIPS/IR/IRCompLoadStore.cpp | 2 - Core/MIPS/IR/IRInst.cpp | 7 ++ Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRJit.cpp | 9 +- Core/MIPS/IR/IRJit.h | 11 +- Core/MIPS/IR/IRPassSimplify.cpp | 4 + 9 files changed, 55 insertions(+), 205 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 7a55c9f8c3d4..078e296c6c9b 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -20,7 +20,6 @@ #include "Core/MIPS/MIPS.h" #include "Core/MIPS/MIPSCodeUtils.h" #include "Core/MIPS/IR/IRJit.h" -#include "Core/MIPS/IR/IRRegCache.h" #include "Common/CPUDetect.h" using namespace MIPSAnalyst; @@ -46,21 +45,6 @@ using namespace MIPSAnalyst; namespace MIPSComp { -void IRJit::CompImmLogic(MIPSGPReg rs, MIPSGPReg rt, u32 uimm, IROp OP) { - if (gpr.IsImm(rs)) { - switch (OP) { - case IROp::AddConst: gpr.SetImm(rt, gpr.GetImm(rs) + uimm); break; - case IROp::SubConst: gpr.SetImm(rt, gpr.GetImm(rs) - uimm); break; - case IROp::AndConst: gpr.SetImm(rt, gpr.GetImm(rs) & uimm); break; - case IROp::OrConst: gpr.SetImm(rt, gpr.GetImm(rs) | uimm); break; - case IROp::XorConst: gpr.SetImm(rt, gpr.GetImm(rs) ^ uimm); break; - } - } else { - gpr.MapDirtyIn(rt, rs); - ir.Write(OP, rt, rs, ir.AddConstant(uimm)); - } -} - void IRJit::Comp_IType(MIPSOpcode op) { CONDITIONAL_DISABLE; s32 simm = (s32)(s16)(op & 0xFFFF); // sign extension @@ -79,36 +63,26 @@ void IRJit::Comp_IType(MIPSOpcode op) { case 9: // R(rt) = R(rs) + simm; break; //addiu // Special-case for small adjustments of pointerified registers. Commonly for SP but happens for others. 
if (simm >= 0) { - CompImmLogic(rs, rt, simm, IROp::AddConst); + ir.Write(IROp::AddConst, rt, rs, ir.AddConstant(simm)); } else if (simm < 0) { - CompImmLogic(rs, rt, -simm, IROp::SubConst); + ir.Write(IROp::SubConst, rt, rs, ir.AddConstant(-simm)); } break; - case 12: CompImmLogic(rs, rt, uimm, IROp::AndConst); break; - case 13: CompImmLogic(rs, rt, uimm, IROp::OrConst); break; - case 14: CompImmLogic(rs, rt, uimm, IROp::XorConst); break; + case 12: ir.Write(IROp::AndConst, rt, rs, ir.AddConstant(uimm)); break; + case 13: ir.Write(IROp::OrConst, rt, rs, ir.AddConstant(uimm)); break; + case 14: ir.Write(IROp::XorConst, rt, rs, ir.AddConstant(uimm)); break; case 10: // R(rt) = (s32)R(rs) < simm; break; //slti - if (gpr.IsImm(rs)) { - gpr.SetImm(rt, (s32)gpr.GetImm(rs) < simm ? 1 : 0); - break; - } - gpr.MapDirtyIn(rt, rs); ir.Write(IROp::SltConst, rt, rs, ir.AddConstant(simm)); break; case 11: // R(rt) = R(rs) < suimm; break; //sltiu - if (gpr.IsImm(rs)) { - gpr.SetImm(rt, gpr.GetImm(rs) < suimm ? 1 : 0); - break; - } - gpr.MapDirtyIn(rt, rs); ir.Write(IROp::SltUConst, rt, rs, ir.AddConstant(suimm)); break; case 15: // R(rt) = uimm << 16; //lui - gpr.SetImm(rt, uimm << 16); + ir.WriteSetConstant(rt, uimm << 16); break; default: @@ -129,11 +103,9 @@ void IRJit::Comp_RType2(MIPSOpcode op) { switch (op & 63) { case 22: //clz - gpr.MapDirtyIn(rd, rs); ir.Write(IROp::Clz, rd, rs); break; case 23: //clo - gpr.MapDirtyIn(rd, rs); ir.Write(IROp::Not, IRTEMP_0, rs); ir.Write(IROp::Clz, rd, IRTEMP_0); break; @@ -142,25 +114,6 @@ void IRJit::Comp_RType2(MIPSOpcode op) { } } -void IRJit::CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp constOp, bool symmetric) { - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - switch (op) { - case IROp::Add: gpr.SetImm(rd, gpr.GetImm(rs) + gpr.GetImm(rt)); break; - case IROp::Sub: gpr.SetImm(rd, gpr.GetImm(rs) - gpr.GetImm(rt)); break; - case IROp::And: gpr.SetImm(rd, gpr.GetImm(rs) & gpr.GetImm(rt)); break; - case IROp::Or: gpr.SetImm(rd, gpr.GetImm(rs) | gpr.GetImm(rt)); break; - case IROp::Xor: gpr.SetImm(rd, gpr.GetImm(rs) ^ gpr.GetImm(rt)); break; - } - return; - } - - // Can't do the RSB optimization on ARM64 - no RSB! - - // Generic solution. If it's an imm, better to flush at this point. 
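With the regcache-based folding deleted here, the Comp_* frontend functions emit one straightforward IR op per MIPS instruction, and the immediate folding that used to happen inline returns two patches later in this series as the PropagateConstants pass, which performs the same folding over the finished IR instead of during decoding.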
- gpr.MapDirtyInIn(rd, rs, rt); - ir.Write(op, rd, rs, rt); -} - void IRJit::Comp_RType3(MIPSOpcode op) { CONDITIONAL_DISABLE; @@ -174,83 +127,56 @@ void IRJit::Comp_RType3(MIPSOpcode op) { switch (op & 63) { case 10: //if (!R(rt)) R(rd) = R(rs); break; //movz - gpr.MapDirtyInIn(rd, rt, rs); ir.Write(IROp::MovZ, rd, rt, rs); break; case 11:// if (R(rt)) R(rd) = R(rs); break; //movn - gpr.MapDirtyInIn(rd, rt, rs); ir.Write(IROp::MovNZ, rd, rt, rs); break; case 32: //R(rd) = R(rs) + R(rt); break; //add case 33: //R(rd) = R(rs) + R(rt); break; //addu - CompType3(rd, rs, rt, IROp::Add, IROp::AddConst, true); + ir.Write(IROp::Add, rd, rs, rt); break; case 34: //R(rd) = R(rs) - R(rt); break; //sub case 35: //R(rd) = R(rs) - R(rt); break; //subu - CompType3(rd, rs, rt, IROp::Sub, IROp::SubConst, false); + ir.Write(IROp::Sub, rd, rs, rt); break; case 36: //R(rd) = R(rs) & R(rt); break; //and - CompType3(rd, rs, rt, IROp::And, IROp::AndConst, true); + ir.Write(IROp::And, rd, rs, rt); break; case 37: //R(rd) = R(rs) | R(rt); break; //or - CompType3(rd, rs, rt, IROp::Or, IROp::OrConst, true); + ir.Write(IROp::Or, rd, rs, rt); break; case 38: //R(rd) = R(rs) ^ R(rt); break; //xor/eor - CompType3(rd, rs, rt, IROp::Xor, IROp::XorConst, true); + ir.Write(IROp::Xor, rd, rs, rt); break; case 39: // R(rd) = ~(R(rs) | R(rt)); break; //nor - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - gpr.SetImm(rd, ~(gpr.GetImm(rs) | gpr.GetImm(rt))); + if (rs == 0) { + ir.Write(IROp::Not, rd, rt); + } else if (rt == 0) { + ir.Write(IROp::Not, rd, rs); } else { - gpr.MapDirtyInIn(rd, rs, rt); - if (rs == 0) { - ir.Write(IROp::Not, rd, rt); - } else if (rt == 0) { - ir.Write(IROp::Not, rd, rs); - } else { - ir.Write(IROp::Or, IRTEMP_0, rs, rt); - ir.Write(IROp::Not, rd, IRTEMP_0); - } + ir.Write(IROp::Or, IRTEMP_0, rs, rt); + ir.Write(IROp::Not, rd, IRTEMP_0); } break; case 42: //R(rd) = (int)R(rs) < (int)R(rt); break; //slt - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - gpr.SetImm(rd, (s32)gpr.GetImm(rs) < (s32)gpr.GetImm(rt)); - } else { - gpr.MapDirtyInIn(rd, rt, rs); - ir.Write(IROp::Slt, rd, rs, rt); - } + ir.Write(IROp::Slt, rd, rs, rt); break; case 43: //R(rd) = R(rs) < R(rt); break; //sltu - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - gpr.SetImm(rd, gpr.GetImm(rs) < gpr.GetImm(rt)); - } else { - gpr.MapDirtyInIn(rd, rt, rs); - ir.Write(IROp::SltU, rd, rs, rt); - } + ir.Write(IROp::SltU, rd, rs, rt); break; case 44: //R(rd) = max(R(rs), R(rt); break; //max - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - gpr.SetImm(rd, std::max(gpr.GetImm(rs), gpr.GetImm(rt))); - break; - } - gpr.MapDirtyInIn(rd, rs, rt); ir.Write(IROp::Max, rd, rs, rt); break; case 45: //R(rd) = min(R(rs), R(rt)); break; //min - if (gpr.IsImm(rs) && gpr.IsImm(rt)) { - gpr.SetImm(rd, std::min(gpr.GetImm(rs), gpr.GetImm(rt))); - break; - } - gpr.MapDirtyInIn(rd, rs, rt); ir.Write(IROp::Min, rd, rs, rt); break; @@ -263,39 +189,13 @@ void IRJit::Comp_RType3(MIPSOpcode op) { void IRJit::CompShiftImm(MIPSOpcode op, IROp shiftOpConst, int sa) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; - if (gpr.IsImm(rt)) { - switch (shiftOpConst) { - case IROp::ShlImm: - gpr.SetImm(rd, gpr.GetImm(rt) << sa); - break; - case IROp::ShrImm: - gpr.SetImm(rd, gpr.GetImm(rt) >> sa); - break; - case IROp::SarImm: - gpr.SetImm(rd, (int)gpr.GetImm(rt) >> sa); - break; - case IROp::RorImm: - gpr.SetImm(rd, (gpr.GetImm(rt) >> sa) | (gpr.GetImm(rt) << (32 - sa))); - break; - default: - DISABLE; - } - } else { - gpr.MapDirtyIn(rd, rt); - ir.Write(shiftOpConst, rd, rt, sa); - } + ir.Write(shiftOpConst, rd, 
rt, sa); } void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpConst) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; - if (gpr.IsImm(rs)) { - int sa = gpr.GetImm(rs) & 0x1F; - CompShiftImm(op, shiftOpConst, sa); - return; - } - gpr.MapDirtyInIn(rd, rs, rt); // Not sure if ARM64 wraps like this so let's do it for it. (TODO: According to the ARM ARM, it will indeed mask for us so this is not necessary) // ANDI2R(SCRATCH1, gpr.R(rs), 0x1F, INVALID_REG); ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31)); @@ -343,12 +243,6 @@ void IRJit::Comp_Special3(MIPSOpcode op) { switch (op & 0x3f) { case 0x0: //ext - if (gpr.IsImm(rs)) { - gpr.SetImm(rt, (gpr.GetImm(rs) >> pos) & mask); - return; - } - - gpr.MapDirtyIn(rt, rs); ir.Write(IROp::Shl, rt, rs); ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(mask)); break; @@ -357,25 +251,10 @@ void IRJit::Comp_Special3(MIPSOpcode op) { { u32 sourcemask = mask >> pos; u32 destmask = ~(sourcemask << pos); - if (gpr.IsImm(rs)) { - u32 inserted = (gpr.GetImm(rs) & sourcemask) << pos; - if (gpr.IsImm(rt)) { - gpr.SetImm(rt, (gpr.GetImm(rt) & destmask) | inserted); - return; - } - - gpr.MapDirty(rt); - ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); - if (inserted != 0) { - ir.Write(IROp::OrConst, rt, rt, inserted); - } - } else { - gpr.MapDirtyIn(rt, rs); - ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(sourcemask)); - ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); - ir.Write(IROp::ShlImm, IRTEMP_0, IRTEMP_0, pos); - ir.Write(IROp::Or, rt, rt, IRTEMP_0); - } + ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(sourcemask)); + ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); + ir.Write(IROp::ShlImm, IRTEMP_0, IRTEMP_0, pos); + ir.Write(IROp::Or, rt, rt, IRTEMP_0); } break; } @@ -391,20 +270,10 @@ void IRJit::Comp_Allegrex(MIPSOpcode op) { switch ((op >> 6) & 31) { case 16: // seb // R(rd) = (u32)(s32)(s8)(u8)R(rt); - if (gpr.IsImm(rt)) { - gpr.SetImm(rd, (s32)(s8)(u8)gpr.GetImm(rt)); - return; - } - gpr.MapDirtyIn(rd, rt); ir.Write(IROp::Ext8to32, rd, rt); break; case 24: // seh - if (gpr.IsImm(rt)) { - gpr.SetImm(rd, (s32)(s16)(u16)gpr.GetImm(rt)); - return; - } - gpr.MapDirtyIn(rd, rt); ir.Write(IROp::Ext16to32, rd, rt); break; @@ -425,20 +294,10 @@ void IRJit::Comp_Allegrex2(MIPSOpcode op) { switch (op & 0x3ff) { case 0xA0: //wsbh - if (gpr.IsImm(rt)) { - gpr.SetImm(rd, ((gpr.GetImm(rt) & 0xFF00FF00) >> 8) | ((gpr.GetImm(rt) & 0x00FF00FF) << 8)); - } else { - gpr.MapDirtyIn(rd, rt); - ir.Write(IROp::BSwap16, rd, rt); - } + ir.Write(IROp::BSwap16, rd, rt); break; case 0xE0: //wsbw - if (gpr.IsImm(rt)) { - gpr.SetImm(rd, swap32(gpr.GetImm(rt))); - } else { - gpr.MapDirtyIn(rd, rt); - ir.Write(IROp::BSwap16, rd, rt); - } + ir.Write(IROp::BSwap16, rd, rt); break; default: Comp_Generic(op); @@ -456,25 +315,21 @@ void IRJit::Comp_MulDivType(MIPSOpcode op) { switch (op & 63) { case 16: // R(rd) = HI; //mfhi if (rd != MIPS_REG_ZERO) { - gpr.MapDirty(rd); ir.Write(IROp::MfHi, rd); } break; case 17: // HI = R(rs); //mthi - gpr.MapIn(rs); ir.Write(IROp::MtHi, 0, rs); break; case 18: // R(rd) = LO; break; //mflo if (rd != MIPS_REG_ZERO) { - gpr.MapDirty(rd); ir.Write(IROp::MfLo, rd); } break; case 19: // LO = R(rs); break; //mtlo - gpr.MapIn(rs); ir.Write(IROp::MtLo, 0, rs); break; diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 7a4601838f4a..9d69b282c212 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -29,7 +29,6 @@ #include 
"Core/MIPS/MIPSTables.h" #include "Core/MIPS/IR/IRJit.h" -#include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "Common/Arm64Emitter.h" @@ -74,12 +73,10 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) MIPSGPReg rhs = rt; if (!delaySlotIsNice) { // if likely, we don't need this if (rs != 0) { - gpr.MapIn(rs); ir.Write(IROp::Mov, IRTEMP_0, rs); lhs = (MIPSGPReg)IRTEMP_0; } if (rt != 0) { - gpr.MapIn(rt); ir.Write(IROp::Mov, IRTEMP_1, rt); rhs = (MIPSGPReg)IRTEMP_1; } @@ -88,7 +85,6 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) if (!likely) CompileDelaySlot(); - gpr.MapInIn(lhs, rhs); FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs, rhs); // This makes the block "impure" :( @@ -121,12 +117,11 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool lhs = (MIPSGPReg)IRTEMP_0; } if (andLink) - gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); + ir.WriteSetConstant(MIPS_REG_RA, GetCompilerPC() + 8); if (!likely) CompileDelaySlot(); - gpr.MapIn(lhs); FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), lhs); if (likely) @@ -294,7 +289,7 @@ void IRJit::Comp_Jump(MIPSOpcode op) { break; case 3: //jal - gpr.SetImm(MIPS_REG_RA, GetCompilerPC() + 8); + ir.WriteSetConstant(MIPS_REG_RA, GetCompilerPC() + 8); CompileDelaySlot(); FlushAll(); ir.Write(IROp::ExitToConst, ir.AddConstant(targetAddr)); @@ -325,27 +320,24 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { int destReg; if (IsSyscall(delaySlotOp)) { - gpr.MapDirty(rs); ir.Write(IROp::SetPC, 0, rs); if (andLink) - gpr.SetImm(rd, GetCompilerPC() + 8); + ir.WriteSetConstant(rd, GetCompilerPC() + 8); CompileDelaySlot(); // Syscall (the delay slot) does FlushAll. return; // Syscall (delay slot) wrote exit code. } else if (delaySlotIsNice) { if (andLink) - gpr.SetImm(rd, GetCompilerPC() + 8); + ir.WriteSetConstant(rd, GetCompilerPC() + 8); CompileDelaySlot(); - gpr.MapDirty(rs); destReg = rs; // Safe because FlushAll doesn't change any regs FlushAll(); } else { // Bad delay slot. - gpr.MapDirty(rs); ir.Write(IROp::Mov, IRTEMP_0, rs); destReg = IRTEMP_0; if (andLink) - gpr.SetImm(rd, GetCompilerPC() + 8); + ir.WriteSetConstant(rd, GetCompilerPC() + 8); CompileDelaySlot(); FlushAll(); } diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 8d9d7b2aa5f2..86e795e50be7 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -82,7 +82,7 @@ void IRJit::Comp_FPUComp(MIPSOpcode op) { int opc = op & 0xF; if (opc >= 8) opc -= 8; // alias if (opc == 0) { // f, sf (signalling false) - gpr.SetImm((MIPSGPReg)IRREG_FPCOND, 0); + ir.Write(IROp::ZeroFpCond); return; } @@ -186,7 +186,6 @@ void IRJit::Comp_mxc1(MIPSOpcode op) if (rt == MIPS_REG_ZERO) { return; } - gpr.MapDirty(rt); ir.Write(IROp::FMovToGPR, rt, fs); return; @@ -196,16 +195,16 @@ void IRJit::Comp_mxc1(MIPSOpcode op) } if (fs == 31) { DISABLE; - } else if (fs == 0) { - gpr.SetImm(rt, MIPSState::FCR0_VALUE); + } + else if (fs == 0) { + ir.Write(IROp::SetConst, rt, ir.AddConstant(MIPSState::FCR0_VALUE)); } else { // Unsupported regs are always 0. 
- gpr.SetImm(rt, 0); + ir.Write(IROp::SetConst, rt, ir.AddConstant(0)); } return; case 4: //FI(fs) = R(rt); break; //mtc1 - gpr.MapDirty(rt); ir.Write(IROp::FMovFromGPR, fs, rt); return; diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp index fb0a143dd8a6..4e702a544f2a 100644 --- a/Core/MIPS/IR/IRCompLoadStore.cpp +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -82,8 +82,6 @@ namespace MIPSComp { return; } - gpr.MapIn(rs); - gpr.MapDirty(rt); int addrReg = IRTEMP_0; switch (o) { // Load diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 460bca7c9052..d05e10bf8daf 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -372,6 +372,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c } break; //cvt.w.s } + case IROp::ZeroFpCond: + mips->fpcond = 0; + break; case IROp::FMovFromGPR: memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4); @@ -481,6 +484,10 @@ int IRWriter::AddConstant(u32 value) { return (int)i; } constPool_.push_back(value); + if (constPool_.size() > 255) { + // Cannot have more than 256 constants in a block! + Crash(); + } return (int)constPool_.size() - 1; } diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index d3fecabe69b1..c8680d1ad0f7 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -121,6 +121,7 @@ enum class IROp : u8 { FpCondToReg, VfpCondToReg, + ZeroFpCond, FCmpUnordered, FCmpEqual, FCmpEqualUnordered, diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 2f88b7740f1c..1b4a9a59c54e 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -41,14 +41,14 @@ namespace MIPSComp { -IRJit::IRJit(MIPSState *mips) : gpr(), mips_(mips) { +IRJit::IRJit(MIPSState *mips) : mips_(mips) { logBlocks = 0; dontLogBlocks = 0; js.startDefaultPrefix = true; js.currentRoundingFunc = convertS0ToSCRATCH1[0]; u32 size = 128 * 1024; blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); - logBlocks = 0; + logBlocks = 100; InitIR(); } @@ -88,7 +88,7 @@ void IRJit::DoDummyState(PointerWrap &p) { } void IRJit::FlushAll() { - gpr.FlushAll(); + // gpr.FlushAll(); // FlushPrefixV(); } @@ -246,8 +246,6 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { js.PrefixStart(); ir.Clear(); - gpr.Start(&ir); - int partialFlushOffset = 0; js.numInstructions = 0; @@ -273,7 +271,6 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } } - if (logBlocks > 0 && dontLogBlocks == 0) { ILOG("=============== IR (%d instructions) ===============", js.numInstructions); for (int i = 0; i < ir.GetInstructions().size(); i++) { diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 63badb5ce2cf..13e9162fa71c 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -84,16 +84,18 @@ class IRBlockCache { int GetNumBlocks() const { return (int)blocks_.size(); } int AllocateBlock(int emAddr) { blocks_.emplace_back(IRBlock(emAddr)); + size_ = (int)blocks_.size(); return (int)blocks_.size() - 1; } IRBlock *GetBlock(int i) { - if (i >= 0 && i < blocks_.size()) { - return &blocks_[i]; + if (i >= 0 && i < size_) { + return blocks_.data() + i; } else { return nullptr; } } private: + int size_; std::vector blocks_; }; @@ -231,8 +233,6 @@ class IRJit : public JitInterface { void BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely); // Utilities to reduce duplicated code - void CompImmLogic(MIPSGPReg rs, MIPSGPReg rt, u32 uimm, IROp op); - void CompType3(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, IROp op, IROp constOp, bool symmetric = false); void CompShiftImm(MIPSOpcode 
op, IROp shiftType, int sa); void CompShiftVar(MIPSOpcode op, IROp shiftType, IROp shiftTypeConst); @@ -258,9 +258,6 @@ class IRJit : public JitInterface { IRBlockCache blocks_; - IRRegCache gpr; - // Arm64RegCacheFPU fpr; - MIPSState *mips_; int dontLogBlocks; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index e110b7380874..6082f261fbf0 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -6,6 +6,10 @@ void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool) { case IROp::AddConst: if (constPool[inst[i].src2] == 0) inst[i].op = IROp::Mov; + else if (inst[i].src1 == 0) { + inst[i].op = IROp::SetConst; + inst[i].src1 = inst[i].src2; + } break; default: break; From 09969c0156162fdf9b4cbf8dcaacbd9dad19706f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 01:06:07 +0200 Subject: [PATCH 08/77] Use the regcache in a new (incomplete) pass, PropagateConstants. --- Core/MIPS/IR/IRCompALU.cpp | 7 +- Core/MIPS/IR/IRCompFPU.cpp | 1 + Core/MIPS/IR/IRInst.cpp | 20 +++-- Core/MIPS/IR/IRInst.h | 8 +- Core/MIPS/IR/IRJit.cpp | 30 +++++-- Core/MIPS/IR/IRPassSimplify.cpp | 140 ++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 3 + Core/MIPS/IR/IRRegCache.cpp | 52 ++++++------ Core/MIPS/IR/IRRegCache.h | 23 +++--- 9 files changed, 230 insertions(+), 54 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 078e296c6c9b..d68150b17842 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -61,12 +61,7 @@ void IRJit::Comp_IType(MIPSOpcode op) { switch (op >> 26) { case 8: // same as addiu? case 9: // R(rt) = R(rs) + simm; break; //addiu - // Special-case for small adjustments of pointerified registers. Commonly for SP but happens for others. 
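Collapsing the simm >= 0 / simm < 0 split (removed just below) into a single AddConst is safe because 32-bit arithmetic wraps: adding the sign-extended immediate and subtracting its magnitude give the same result, assuming the AddConst interpreter case is a plain u32 add. A quick check of the identity:

    u32 rs = 0x1000;
    s32 simm = -4;
    // rs + (u32)simm == 0x0FFC == rs - (u32)(-simm)
    assert(rs + (u32)simm == rs - (u32)(-simm));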
- if (simm >= 0) { - ir.Write(IROp::AddConst, rt, rs, ir.AddConstant(simm)); - } else if (simm < 0) { - ir.Write(IROp::SubConst, rt, rs, ir.AddConstant(-simm)); - } + ir.Write(IROp::AddConst, rt, rs, ir.AddConstant(simm)); break; case 12: ir.Write(IROp::AndConst, rt, rs, ir.AddConstant(uimm)); break; diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 86e795e50be7..f94bddbaad7a 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -121,6 +121,7 @@ void IRJit::Comp_FPUComp(MIPSOpcode op) { void IRJit::Comp_FPU2op(MIPSOpcode op) { CONDITIONAL_DISABLE; + int fs = _FS; int fd = _FD; diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index d05e10bf8daf..b4eb14d98f5e 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -9,7 +9,7 @@ #include "math/math_util.h" static const IRMeta irMeta[] = { - { IROp::SetConst, "SetConst", "GC_" }, + { IROp::SetConst, "SetConst", "GC" }, { IROp::Mov, "Mov", "GG" }, { IROp::Add, "Add", "GGG" }, { IROp::Sub, "Sub", "GGG" }, @@ -81,7 +81,7 @@ static const IRMeta irMeta[] = { { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, { IROp::FpCondToReg, "FpCondToReg", "G" }, - { IROp::SetCtrlVFPU, "SetCtrlVFPU", "T" }, + { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, { IROp::ExitToConst, "Exit", "C" }, @@ -94,7 +94,9 @@ static const IRMeta irMeta[] = { { IROp::ExitToReg, "ExitToReg", "G" }, { IROp::Syscall, "Syscall", "_C" }, { IROp::Break, "Break", ""}, - { IROp::SetPC, "SetPC", "_G"}, + { IROp::SetPC, "SetPC", "_G" }, + { IROp::SetPCConst, "SetPC", "_C" }, + { IROp::CallReplacement, "CallRepl", "_C"}, }; const IRMeta *metaIndex[256]; @@ -454,6 +456,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c Crash(); break; + case IROp::SetCtrlVFPU: + mips->vfpuCtrl[inst->dest] = constPool[inst->src1]; + break; + default: Crash(); } @@ -498,7 +504,7 @@ int IRWriter::AddConstantFloat(float value) { } void IRWriter::Simplify() { - SimplifyInPlace(&insts_[0], insts_.size(), constPool_.data()); + SimplifyInPlace(&insts_[0], (int)insts_.size(), constPool_.data()); } const char *GetGPRName(int r) { @@ -536,8 +542,12 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co } } +const IRMeta *GetIRMeta(IROp op) { + return metaIndex[(int)op]; +} + void DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool) { - const IRMeta *meta = metaIndex[(int)inst.op]; + const IRMeta *meta = GetIRMeta(inst.op); if (!meta) { snprintf(buf, bufsize, "Unknown %d", (int)inst.op); return; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index c8680d1ad0f7..2c6ab75cd756 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -249,6 +249,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c class IRWriter { public: void Write(IROp op, u8 dst = 0, u8 src1 = 0, u8 src2 = 0); + void Write(IRInst inst) { + insts_.push_back(inst); + } void WriteSetConstant(u8 dst, u32 value); int AddConstant(u32 value); @@ -261,13 +264,14 @@ class IRWriter { void Simplify(); - const std::vector &GetInstructions() { return insts_; } - const std::vector &GetConstants() { return constPool_; } + const std::vector &GetInstructions() const { return insts_; } + const std::vector &GetConstants() const { return constPool_; } private: std::vector insts_; std::vector constPool_; }; +const IRMeta *GetIRMeta(IROp op); void 
DisassembleIR(char *buf, size_t bufsize, IRInst inst, const u32 *constPool); void InitIR(); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 1b4a9a59c54e..d393bf78ca70 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -36,6 +36,7 @@ #include "Core/HLE/sceKernelMemory.h" #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/JitCommon/JitCommon.h" namespace MIPSComp @@ -44,11 +45,11 @@ namespace MIPSComp IRJit::IRJit(MIPSState *mips) : mips_(mips) { logBlocks = 0; dontLogBlocks = 0; - js.startDefaultPrefix = true; + js.startDefaultPrefix = mips_->HasDefaultPrefix(); js.currentRoundingFunc = convertS0ToSCRATCH1[0]; u32 size = 128 * 1024; blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); - logBlocks = 100; + logBlocks = 12; InitIR(); } @@ -88,8 +89,7 @@ void IRJit::DoDummyState(PointerWrap &p) { } void IRJit::FlushAll() { - // gpr.FlushAll(); - // FlushPrefixV(); + FlushPrefixV(); } void IRJit::FlushPrefixV() { @@ -259,7 +259,15 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { ir.Simplify(); - b->SetInstructions(ir.GetInstructions(), ir.GetConstants()); + IRWriter simplified; + + IRWriter *code = &ir; + if (true) { + PropagateConstants(ir, simplified); + code = &simplified; + } + + b->SetInstructions(code->GetInstructions(), code->GetConstants()); if (logBlocks > 0 && dontLogBlocks == 0) { char temp2[256]; @@ -272,7 +280,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } if (logBlocks > 0 && dontLogBlocks == 0) { - ILOG("=============== IR (%d instructions) ===============", js.numInstructions); + ILOG("=============== Original IR (%d instructions) ===============", (int)ir.GetInstructions().size()); for (int i = 0; i < ir.GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); @@ -281,6 +289,16 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { ILOG("=============== end ================="); } + if (logBlocks > 0 && dontLogBlocks == 0) { + ILOG("=============== IR (%d instructions) ===============", (int)code->GetInstructions().size()); + for (int i = 0; i < code->GetInstructions().size(); i++) { + char buf[256]; + DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); + ILOG("%s", buf); + } + ILOG("=============== end ================="); + } + if (logBlocks > 0) logBlocks--; if (dontLogBlocks > 0) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 6082f261fbf0..38141951de10 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -1,4 +1,5 @@ #include "Core/MIPS/IR/IRPassSimplify.h" +#include "Core/MIPS/IR/IRRegCache.h" void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool) { for (int i = 0; i < count; i++) { @@ -15,4 +16,143 @@ void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool) { break; } } +} + + +u32 Evaluate(u32 a, u32 b, IROp op) { + switch (op) { + case IROp::Add: case IROp::AddConst: return a + b; + case IROp::Sub: case IROp::SubConst: return a - b; + case IROp::And: case IROp::AndConst: return a & b; + case IROp::Or: case IROp::OrConst: return a | b; + case IROp::Xor: case IROp::XorConst: return a ^ b; + default: + return -1; + } +} + +IROp ArithToArithConst(IROp op) { + switch (op) { + case IROp::Add: return IROp::AddConst; + case IROp::Sub: return IROp::SubConst; + case IROp::And: return IROp::AndConst; + case IROp::Or: return IROp::OrConst; + case 
IROp::Xor: return IROp::XorConst; + default: + return (IROp)-1; + } +} + + +void PropagateConstants(const IRWriter &in, IRWriter &out) { + IRRegCache gpr(&out); + + const u32 *constants = in.GetConstants().data(); + for (int i = 0; i < (int)in.GetInstructions().size(); i++) { + IRInst inst = in.GetInstructions()[i]; + bool symmetric = true; + switch (inst.op) { + case IROp::SetConst: + gpr.SetImm((MIPSGPReg)inst.dest, constants[inst.src1]); + break; + + case IROp::Sub: + symmetric = false; // fallthrough + case IROp::Add: + case IROp::And: + case IROp::Or: + case IROp::Xor: + if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { + gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); + } else if (gpr.IsImm(inst.src2) && inst.src1 != inst.src2 && inst.dest != inst.src2) { + gpr.MapDirtyIn(inst.dest, inst.src1); + if (gpr.GetImm(inst.src2) == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) { + out.Write(IROp::Mov, inst.dest, inst.src1); + } else { + out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(gpr.GetImm(inst.src2))); + } + } else if (gpr.IsImm(inst.src1) && inst.src1 != inst.src2 && inst.dest != inst.src2 && symmetric) { + gpr.MapDirtyIn(inst.dest, inst.src2); + out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(gpr.GetImm(inst.src1))); + } else { + gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); + goto doDefault; + } + break; + + case IROp::AddConst: + case IROp::SubConst: + case IROp::AndConst: + case IROp::OrConst: + case IROp::XorConst: + if (gpr.IsImm(inst.src1)) { + gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), constants[inst.src2], inst.op)); + } else { + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + } + break; + + case IROp::Mov: + if (inst.src1 == inst.src2) { + // Nop + } else if (gpr.IsImm(inst.src1)) { + gpr.SetImm(inst.dest, gpr.GetImm(inst.src1)); + } else { + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + } + break; + + case IROp::Store8: + case IROp::Store16: + case IROp::Store32: + // Just pass through, no excessive flushing + gpr.MapInIn(inst.dest, inst.src1); + goto doDefault; + + case IROp::Load8: + case IROp::Load8Ext: + case IROp::Load16: + case IROp::Load16Ext: + case IROp::Load32: + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + + case IROp::ExitToConst: + case IROp::ExitToReg: + case IROp::ExitToConstIfEq: + case IROp::ExitToConstIfNeq: + case IROp::ExitToConstIfFpFalse: + case IROp::ExitToConstIfFpTrue: + case IROp::ExitToConstIfGeZ: + case IROp::ExitToConstIfGtZ: + case IROp::ExitToConstIfLeZ: + case IROp::ExitToConstIfLtZ: + default: + { + gpr.FlushAll(); + doDefault: + // Remap constants to the new reality + const IRMeta *m = GetIRMeta(inst.op); + switch (m->types[0]) { + case 'C': + inst.dest = out.AddConstant(constants[inst.dest]); + break; + } + switch (m->types[1]) { + case 'C': + inst.src1 = out.AddConstant(constants[inst.src1]); + break; + } + switch (m->types[2]) { + case 'C': + inst.src2 = out.AddConstant(constants[inst.src2]); + break; + } + out.Write(inst); + break; + } + } + } } \ No newline at end of file diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index 8706661704b6..b5d0af1e95d1 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -4,3 +4,6 @@ // Dumb example of a simplification pass that can't add or remove instructions. 
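To make the effect of the PropagateConstants pass above concrete, here is roughly what it does to a small block, using the disassembly notation from IRInst.cpp (register names and the exit address are illustrative):

    SetConst a0, 0x10
    AddConst a0, a0, 0x20
    Add      v0, a0, a1
    Exit     0x08804000

comes out of the pass as approximately:

    AddConst v0, a1, 0x30
    SetConst a0, 0x30
    Exit     0x08804000

The first two instructions only update the register cache's idea of a0, the Add is rewritten against the now-known immediate, and the pending value of a0 is written back as a SetConst when the exit forces FlushAll().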
void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); + + +void PropagateConstants(const IRWriter &in, IRWriter &out); \ No newline at end of file diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index 808370ce6321..f1c020139579 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -1,7 +1,7 @@ #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRInst.h" -void IRRegCache::Dirty(MIPSGPReg rd) { +void IRRegCache::Flush(int rd) { if (rd == 0) { return; } @@ -11,38 +11,42 @@ void IRRegCache::Dirty(MIPSGPReg rd) { } } -void IRRegCache::MapIn(MIPSGPReg rd) { - Dirty(rd); -} - -void IRRegCache::MapInIn(MIPSGPReg rs, MIPSGPReg rt) { - Dirty(rs); - Dirty(rt); +void IRRegCache::Discard(int rd) { + if (rd == 0) { + return; + } + reg_[rd].isImm = false; } -void IRRegCache::MapDirty(MIPSGPReg rd) { - Dirty(rd); +IRRegCache::IRRegCache(IRWriter *ir) : ir_(ir) { + memset(®_, 0, sizeof(reg_)); + reg_[0].isImm = true; + ir_ = ir; } -void IRRegCache::MapDirtyIn(MIPSGPReg rd, MIPSGPReg rs) { - Dirty(rd); - Dirty(rs); +void IRRegCache::FlushAll() { + for (int i = 0; i < TOTAL_MAPPABLE_MIPSREGS; i++) { + Flush(i); + } } -void IRRegCache::MapDirtyInIn(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt) { - Dirty(rd); - Dirty(rs); - Dirty(rt); +void IRRegCache::MapInIn(int rs, int rt) { + Flush(rs); + Flush(rt); } -void IRRegCache::Start(IRWriter *ir) { - memset(®_, 0, sizeof(reg_)); - reg_[0].isImm = true; - ir_ = ir; +void IRRegCache::MapDirtyIn(int rd, int rs) { + if (rs != rd) { + Discard(rd); + } + Flush(rs); } -void IRRegCache::FlushAll() { - for (int i = 0; i < TOTAL_MAPPABLE_MIPSREGS; i++) { - Dirty((MIPSGPReg)i); +void IRRegCache::MapDirtyInIn(int rd, int rs, int rt) { + if (rs != rd && rt != rd) { + Discard(rd); } + Flush(rs); + Flush(rt); } + diff --git a/Core/MIPS/IR/IRRegCache.h b/Core/MIPS/IR/IRRegCache.h index bf53e2a818f0..1d7e78f7a888 100644 --- a/Core/MIPS/IR/IRRegCache.h +++ b/Core/MIPS/IR/IRRegCache.h @@ -17,27 +17,28 @@ struct RegIR { class IRWriter; +// Transient class IRRegCache { public: - void SetImm(MIPSGPReg r, u32 immVal) { + IRRegCache(IRWriter *ir); + + void SetImm(int r, u32 immVal) { reg_[r].isImm = true; reg_[r].immVal = immVal; } - bool IsImm(MIPSGPReg r) const { return reg_[r].isImm; } - u32 GetImm(MIPSGPReg r) const { return reg_[r].immVal; } - - void MapIn(MIPSGPReg rd); - void MapInIn(MIPSGPReg rs, MIPSGPReg rt); - void MapDirty(MIPSGPReg rd); - void MapDirtyIn(MIPSGPReg rd, MIPSGPReg rs); - void MapDirtyInIn(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt); + bool IsImm(int r) const { return reg_[r].isImm; } + u32 GetImm(int r) const { return reg_[r].immVal; } - void Start(IRWriter *ir); void FlushAll(); + void MapInIn(int rs, int rt); + void MapDirtyIn(int rd, int rs); + void MapDirtyInIn(int rd, int rs, int rt); + private: - void Dirty(MIPSGPReg rd); + void Flush(int rd); + void Discard(int rd); RegIR reg_[TOTAL_MAPPABLE_MIPSREGS]; IRWriter *ir_; }; From ce8aae5ed1eca2e2e10f0ffaaa2303b8e83a6d34 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 01:43:27 +0200 Subject: [PATCH 09/77] Make the IRJit core selectable in developer tools --- Core/Config.cpp | 28 +++++++++++++++++++--------- Core/Config.h | 7 ++++++- Core/CoreParameter.h | 8 ++------ Core/MIPS/IR/IRCompBranch.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 4 ++++ Core/MIPS/IR/IRInst.h | 2 +- Core/MIPS/IR/IRJit.cpp | 6 ++++++ Core/MIPS/JitCommon/JitCommon.cpp | 4 ---- Core/MIPS/MIPS.cpp | 27 ++++++++++++++++++++------- Core/MemMapFunctions.cpp | 4 ++-- 
UI/EmuScreen.cpp | 4 ++-- UI/GameSettingsScreen.cpp | 7 +++++-- UI/MiscScreens.cpp | 2 +- UI/NativeApp.cpp | 8 ++++++-- android/jni/TestRunner.cpp | 2 +- headless/Headless.cpp | 10 ++++++---- unittest/JitHarness.cpp | 4 ++-- 17 files changed, 84 insertions(+), 45 deletions(-) diff --git a/Core/Config.cpp b/Core/Config.cpp index 6faa97a0759b..137194c47571 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -282,9 +282,20 @@ static int DefaultNumWorkers() { return cpu_info.num_cores; } -static bool DefaultJit() { +// TODO: Default to IRJit on iOS when it's done. +static int DefaultCpuCore() { #ifdef IOS - return iosCanUseJit; + return iosCanUseJit ? CPU_CORE_JIT : CPU_CORE_INTERPRETER; +#elif defined(ARM) || defined(ARM64) || defined(_M_IX86) || defined(_M_X64) + return CPU_CORE_JIT; +#else + return CPU_CORE_INTERPRETER; +#endif +} + +static bool DefaultCodeGen() { +#ifdef IOS + return iosCanUseJit ? true : false; #elif defined(ARM) || defined(ARM64) || defined(_M_IX86) || defined(_M_X64) return true; #else @@ -353,8 +364,7 @@ static bool DefaultSasThread() { } static ConfigSetting cpuSettings[] = { - ReportedConfigSetting("Jit", &g_Config.bJit, &DefaultJit, true, true), - ReportedConfigSetting("CPUCore", &g_Config.bJit, &DefaultJit, true, true), + ReportedConfigSetting("CPUCore", &g_Config.iCpuCore, &DefaultCpuCore, true, true), ReportedConfigSetting("SeparateCPUThread", &g_Config.bSeparateCPUThread, false, true, true), ReportedConfigSetting("SeparateSASThread", &g_Config.bSeparateSASThread, &DefaultSasThread, true, true), ReportedConfigSetting("SeparateIOThread", &g_Config.bSeparateIOThread, true, true, true), @@ -464,7 +474,7 @@ static ConfigSetting graphicsSettings[] = { ReportedConfigSetting("VertexCache", &g_Config.bVertexCache, true, true, true), ReportedConfigSetting("TextureBackoffCache", &g_Config.bTextureBackoffCache, false, true, true), ReportedConfigSetting("TextureSecondaryCache", &g_Config.bTextureSecondaryCache, false, true, true), - ReportedConfigSetting("VertexDecJit", &g_Config.bVertexDecoderJit, &DefaultJit, false), + ReportedConfigSetting("VertexDecJit", &g_Config.bVertexDecoderJit, &DefaultCodeGen, false), #ifndef MOBILE_DEVICE ConfigSetting("FullScreen", &g_Config.bFullScreen, false), @@ -959,16 +969,16 @@ void Config::Load(const char *iniFileName, const char *controllerIniFilename) { } // Override ppsspp.ini JIT value to prevent crashing - if (!DefaultJit() && g_Config.bJit) { + if (DefaultCpuCore() != CPU_CORE_JIT && g_Config.iCpuCore == CPU_CORE_JIT) { jitForcedOff = true; - g_Config.bJit = false; + g_Config.iCpuCore = CPU_CORE_INTERPRETER; } } void Config::Save() { if (jitForcedOff) { // if JIT has been forced off, we don't want to screw up the user's ppsspp.ini - g_Config.bJit = true; + g_Config.iCpuCore = CPU_CORE_JIT; } if (iniFilename_.size() && g_Config.bSaveSettings) { @@ -1037,7 +1047,7 @@ void Config::Save() { } if (jitForcedOff) { // force JIT off again just in case Config::Save() is called without exiting PPSSPP - g_Config.bJit = false; + g_Config.iCpuCore = CPU_CORE_INTERPRETER; } } diff --git a/Core/Config.h b/Core/Config.h index 825091bd0cc6..6a2016997b0c 100644 --- a/Core/Config.h +++ b/Core/Config.h @@ -33,6 +33,12 @@ const int PSP_DEFAULT_FIRMWARE = 150; static const s8 VOLUME_OFF = 0; static const s8 VOLUME_MAX = 10; +enum CPUCore { + CPU_CORE_INTERPRETER = 0, + CPU_CORE_JIT = 1, + CPU_CORE_IRJIT = 2, +}; + enum { ROTATION_AUTO = 0, ROTATION_LOCKED_HORIZONTAL = 1, @@ -119,7 +125,6 @@ struct Config { // Core bool bIgnoreBadMemAccess; bool 
bFastMemory; - bool bJit; int iCpuCore; bool bCheckForNewVersion; bool bForceLagSync; diff --git a/Core/CoreParameter.h b/Core/CoreParameter.h index 1517b50c03c1..c9351443c783 100644 --- a/Core/CoreParameter.h +++ b/Core/CoreParameter.h @@ -20,12 +20,7 @@ #include #include "Core/Compatibility.h" - -enum CPUCore { - CPU_INTERPRETER, - CPU_JIT, - CPU_IRJIT, -}; +#include "Core/Config.h" enum GPUCore { GPUCORE_NULL, @@ -47,6 +42,7 @@ struct CoreParameter { CPUCore cpuCore; GPUCore gpuCore; + GraphicsContext *graphicsContext; // TODO: Find a better place. Thin3DContext *thin3d; bool enableSound; // there aren't multiple sound cores. diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 9d69b282c212..2b478f695a01 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -219,7 +219,7 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { MIPSOpcode delaySlotOp = GetOffsetInstruction(1); - ir.Write(IROp::VfpCondToReg, IRTEMP_0); + ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, VFPU_CTRL_CC); ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index b4eb14d98f5e..cba03ae7957b 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -81,6 +81,7 @@ static const IRMeta irMeta[] = { { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, { IROp::FpCondToReg, "FpCondToReg", "G" }, + { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, @@ -329,6 +330,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FpCondToReg: mips->r[inst->dest] = mips->fpcond; break; + case IROp::VfpuCtrlToReg: + mips->r[inst->dest] = mips->vfpuCtrl[inst->src1]; + break; case IROp::FRound: mips->r[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); break; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 2c6ab75cd756..e044825f1ccb 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -119,7 +119,7 @@ enum class IROp : u8 { FMovToGPR, FpCondToReg, - VfpCondToReg, + VfpuCtrlToReg, ZeroFpCond, FCmpUnordered, diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index d393bf78ca70..0d8fca504b14 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -255,6 +255,12 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { MIPSCompileOp(inst, this); js.compilerPC += 4; js.numInstructions++; + + if (ir.GetConstants().size() > 128) { + // Need to break the block + ir.Write(IROp::ExitToConst, ir.AddConstant(js.compilerPC)); + js.compiling = false; + } } ir.Simplify(); diff --git a/Core/MIPS/JitCommon/JitCommon.cpp b/Core/MIPS/JitCommon/JitCommon.cpp index 630494f97329..e267b9352ed1 100644 --- a/Core/MIPS/JitCommon/JitCommon.cpp +++ b/Core/MIPS/JitCommon/JitCommon.cpp @@ -47,9 +47,6 @@ namespace MIPSComp { } JitInterface *CreateNativeJit(MIPSState *mips) { -#if 1 - return new MIPSComp::IRJit(mips); -#else #if defined(ARM) return new MIPSComp::ArmJit(mips); #elif defined(ARM64) @@ -60,7 +57,6 @@ namespace MIPSComp { return new MIPSComp::MipsJit(mips); #else return new MIPSComp::FakeJit(mips); -#endif #endif } diff --git a/Core/MIPS/MIPS.cpp b/Core/MIPS/MIPS.cpp index 1140b67914fb..0482f21211f6 100644 --- a/Core/MIPS/MIPS.cpp +++ b/Core/MIPS/MIPS.cpp @@ -27,6 +27,7 @@ #include "Core/MIPS/MIPSTables.h" #include "Core/MIPS/MIPSDebugInterface.h" 
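As an aside (illustration only), the three-way core setting introduced here is consumed by switching on the enum, as the MIPS.cpp hunk just below does in Init()/UpdateCore(); a tiny sketch of that shape, with plain strings standing in for CreateNativeJit(), new IRJit() and the interpreter path:

#include <cstdio>

enum CPUCore { CPU_CORE_INTERPRETER = 0, CPU_CORE_JIT = 1, CPU_CORE_IRJIT = 2 };

// Placeholder for the real factories; the interpreter case leaves jit == nullptr.
static const char *CoreName(CPUCore core) {
	switch (core) {
	case CPU_CORE_JIT:         return "native jit (CreateNativeJit)";
	case CPU_CORE_IRJIT:       return "IR jit (new IRJit)";
	case CPU_CORE_INTERPRETER: return "interpreter (no jit object)";
	}
	return "?";
}

int main() {
	int iCpuCore = CPU_CORE_IRJIT;  // as read from the new "CPUCore" ini setting
	printf("selected core: %s\n", CoreName((CPUCore)iCpuCore));
	return 0;
}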
#include "Core/MIPS/MIPSVFPUUtils.h" +#include "Core/MIPS/IR/IRJit.h" #include "Core/Reporting.h" #include "Core/System.h" #include "Core/HLE/sceDisplay.h" @@ -206,8 +207,10 @@ void MIPSState::Init() { // Initialize the VFPU random number generator with .. something? rng.Init(0x1337); - if (PSP_CoreParameter().cpuCore == CPU_JIT) { + if (PSP_CoreParameter().cpuCore == CPU_CORE_JIT) { MIPSComp::jit = MIPSComp::CreateNativeJit(this); + } else if (PSP_CoreParameter().cpuCore == CPU_CORE_IRJIT) { + MIPSComp::jit = new MIPSComp::IRJit(this); } else { MIPSComp::jit = nullptr; } @@ -224,14 +227,23 @@ void MIPSState::UpdateCore(CPUCore desired) { PSP_CoreParameter().cpuCore = desired; switch (PSP_CoreParameter().cpuCore) { - case CPU_JIT: + case CPU_CORE_JIT: INFO_LOG(CPU, "Switching to JIT"); - if (!MIPSComp::jit) { - MIPSComp::jit = MIPSComp::CreateNativeJit(this); + if (MIPSComp::jit) { + delete MIPSComp::jit; } + MIPSComp::jit = MIPSComp::CreateNativeJit(this); + break; + + case CPU_CORE_IRJIT: + INFO_LOG(CPU, "Switching to IRJIT"); + if (MIPSComp::jit) { + delete MIPSComp::jit; + } + MIPSComp::jit = new MIPSComp::IRJit(this); break; - case CPU_INTERPRETER: + case CPU_CORE_INTERPRETER: INFO_LOG(CPU, "Switching to interpreter"); delete MIPSComp::jit; MIPSComp::jit = 0; @@ -292,11 +304,12 @@ void MIPSState::SingleStep() { // returns 1 if reached ticks limit int MIPSState::RunLoopUntil(u64 globalTicks) { switch (PSP_CoreParameter().cpuCore) { - case CPU_JIT: + case CPU_CORE_JIT: + case CPU_CORE_IRJIT: MIPSComp::jit->RunLoopUntil(globalTicks); break; - case CPU_INTERPRETER: + case CPU_CORE_INTERPRETER: return MIPSInterpret_RunUntil(globalTicks); } return 1; diff --git a/Core/MemMapFunctions.cpp b/Core/MemMapFunctions.cpp index 93029d65ff5c..d367205ef797 100644 --- a/Core/MemMapFunctions.cpp +++ b/Core/MemMapFunctions.cpp @@ -87,7 +87,7 @@ inline void ReadFromHardware(T &var, const u32 address) { var = *((const T*)GetPointerUnchecked(address)); } else { // In jit, we only flush PC when bIgnoreBadMemAccess is off. - if (g_Config.bJit && g_Config.bIgnoreBadMemAccess) { + if (g_Config.iCpuCore != CPU_CORE_INTERPRETER && g_Config.bIgnoreBadMemAccess) { WARN_LOG(MEMMAP, "ReadFromHardware: Invalid address %08x", address); } else { WARN_LOG(MEMMAP, "ReadFromHardware: Invalid address %08x PC %08x LR %08x", address, currentMIPS->pc, currentMIPS->r[MIPS_REG_RA]); @@ -123,7 +123,7 @@ inline void WriteToHardware(u32 address, const T data) { *(T*)GetPointerUnchecked(address) = data; } else { // In jit, we only flush PC when bIgnoreBadMemAccess is off. - if (g_Config.bJit && g_Config.bIgnoreBadMemAccess) { + if (g_Config.iCpuCore != CPU_CORE_INTERPRETER && g_Config.bIgnoreBadMemAccess) { WARN_LOG(MEMMAP, "WriteToHardware: Invalid address %08x", address); } else { WARN_LOG(MEMMAP, "WriteToHardware: Invalid address %08x PC %08x LR %08x", address, currentMIPS->pc, currentMIPS->r[MIPS_REG_RA]); diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp index dd89e1ae7d7c..06a3446eb61a 100644 --- a/UI/EmuScreen.cpp +++ b/UI/EmuScreen.cpp @@ -101,7 +101,7 @@ void EmuScreen::bootGame(const std::string &filename) { invalid_ = true; CoreParameter coreParam; - coreParam.cpuCore = g_Config.bJit ? 
CPU_JIT : CPU_INTERPRETER; + coreParam.cpuCore = (CPUCore)g_Config.iCpuCore; coreParam.gpuCore = GPUCORE_GLES; switch (GetGPUBackend()) { case GPUBackend::OPENGL: @@ -282,7 +282,7 @@ void EmuScreen::sendMessage(const char *message, const char *value) { } else if (!strcmp(message, "clear jit")) { currentMIPS->ClearJitCache(); if (PSP_IsInited()) { - currentMIPS->UpdateCore(g_Config.bJit ? CPU_JIT : CPU_INTERPRETER); + currentMIPS->UpdateCore((CPUCore)g_Config.iCpuCore); } } else if (!strcmp(message, "window minimized")) { if (!strcmp(value, "true")) { diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index 2a5828eaec0a..1ade2a4303f3 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -1059,8 +1059,11 @@ void DeveloperToolsScreen::CreateViews() { } } #endif - if (canUseJit) { - list->Add(new CheckBox(&g_Config.bJit, sy->T("Dynarec", "Dynarec (JIT)")))->OnClick.Handle(this, &DeveloperToolsScreen::OnJitAffectingSetting); + + static const char *cpuCores[] = { "Interpreter", "Dynarec (JIT)", "IRJit" }; + PopupMultiChoice *core = list->Add(new PopupMultiChoice(&g_Config.iCpuCore, gr->T("CPU Core"), cpuCores, 0, ARRAY_SIZE(cpuCores), sy->GetName(), screenManager())); + if (!canUseJit) { + core->HideChoice(1); } list->Add(new CheckBox(&g_Config.bShowDeveloperMenu, dev->T("Show Developer Menu"))); diff --git a/UI/MiscScreens.cpp b/UI/MiscScreens.cpp index e6f0cc28e650..c94c9e6df49c 100644 --- a/UI/MiscScreens.cpp +++ b/UI/MiscScreens.cpp @@ -133,7 +133,7 @@ void HandleCommonMessages(const char *message, const char *value, ScreenManager MIPSComp::jit->ClearCache(); } if (PSP_IsInited()) { - currentMIPS->UpdateCore(g_Config.bJit ? CPU_JIT : CPU_INTERPRETER); + currentMIPS->UpdateCore((CPUCore)g_Config.iCpuCore); } } } diff --git a/UI/NativeApp.cpp b/UI/NativeApp.cpp index 786e4e51263c..9ca8045dd89e 100644 --- a/UI/NativeApp.cpp +++ b/UI/NativeApp.cpp @@ -392,11 +392,15 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch gfxLog = true; break; case 'j': - g_Config.bJit = true; + g_Config.iCpuCore = CPU_CORE_JIT; g_Config.bSaveSettings = false; break; case 'i': - g_Config.bJit = false; + g_Config.iCpuCore = CPU_CORE_INTERPRETER; + g_Config.bSaveSettings = false; + break; + case 'r': + g_Config.iCpuCore = CPU_CORE_IRJIT; g_Config.bSaveSettings = false; break; case '-': diff --git a/android/jni/TestRunner.cpp b/android/jni/TestRunner.cpp index cfc4e0354e36..29ca2b2a0cc4 100644 --- a/android/jni/TestRunner.cpp +++ b/android/jni/TestRunner.cpp @@ -69,7 +69,7 @@ void RunTests() #endif CoreParameter coreParam; - coreParam.cpuCore = g_Config.bJit ? CPU_JIT : CPU_INTERPRETER; + coreParam.cpuCore = (CPUCore)g_Config.iCpuCore; coreParam.gpuCore = g_Config.bSoftwareRendering ? 
GPUCORE_SOFTWARE : GPUCORE_GLES; coreParam.enableSound = g_Config.bEnableSound; coreParam.graphicsContext = PSP_CoreParameter().graphicsContext; diff --git a/headless/Headless.cpp b/headless/Headless.cpp index 687ac1d875dd..f818da68cab7 100644 --- a/headless/Headless.cpp +++ b/headless/Headless.cpp @@ -207,11 +207,11 @@ int main(int argc, const char* argv[]) #endif bool fullLog = false; - bool useJit = true; bool autoCompare = false; bool verbose = false; const char *stateToLoad = 0; GPUCore gpuCore = GPUCORE_NULL; + CPUCore cpuCore = CPU_CORE_JIT; std::vector testFilenames; const char *mountIso = 0; @@ -236,9 +236,11 @@ int main(int argc, const char* argv[]) else if (!strcmp(argv[i], "-l") || !strcmp(argv[i], "--log")) fullLog = true; else if (!strcmp(argv[i], "-i")) - useJit = false; + cpuCore = CPU_CORE_INTERPRETER; else if (!strcmp(argv[i], "-j")) - useJit = true; + cpuCore = CPU_CORE_JIT; + else if (!strcmp(argv[i], "-ir")) + cpuCore = CPU_CORE_IRJIT; else if (!strcmp(argv[i], "-c") || !strcmp(argv[i], "--compare")) autoCompare = true; else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) @@ -311,7 +313,7 @@ int main(int argc, const char* argv[]) } CoreParameter coreParameter; - coreParameter.cpuCore = useJit ? CPU_JIT : CPU_INTERPRETER; + coreParameter.cpuCore = cpuCore; coreParameter.gpuCore = glWorking ? gpuCore : GPUCORE_NULL; coreParameter.graphicsContext = graphicsContext; coreParameter.enableSound = false; diff --git a/unittest/JitHarness.cpp b/unittest/JitHarness.cpp index b80fb04e6d2a..2467a27c0b5c 100644 --- a/unittest/JitHarness.cpp +++ b/unittest/JitHarness.cpp @@ -83,7 +83,7 @@ static void SetupJitHarness() { coreState = CORE_POWERUP; currentMIPS = &mipsr4k; Memory::g_MemorySize = Memory::RAM_NORMAL_SIZE; - PSP_CoreParameter().cpuCore = CPU_INTERPRETER; + PSP_CoreParameter().cpuCore = CPU_CORE_INTERPRETER; PSP_CoreParameter().unthrottle = true; Memory::Init(); @@ -169,7 +169,7 @@ bool TestJit() { double jit_speed = 0.0, interp_speed = 0.0; if (compileSuccess) { interp_speed = ExecCPUTest(); - mipsr4k.UpdateCore(CPU_JIT); + mipsr4k.UpdateCore(CPU_CORE_JIT); jit_speed = ExecCPUTest(); // Disassemble From 1a2edc67d07ca63e6659fa38b9675e2c415c609f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 02:03:11 +0200 Subject: [PATCH 10/77] Add support for float store/load --- Core/MIPS/IR/IRCompFPU.cpp | 22 +++++++++++++++++++++- Core/MIPS/IR/IRInst.cpp | 8 ++++++++ Core/MemMap.h | 16 ++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index f94bddbaad7a..c4353dd9836b 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -73,7 +73,27 @@ void IRJit::Comp_FPU3op(MIPSOpcode op) { } void IRJit::Comp_FPULS(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + s32 offset = _IMM16; + int ft = _FT; + MIPSGPReg rs = _RS; + + switch (op >> 26) { + case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1 + { + ir.Write(IROp::LoadFloat, ft, rs, ir.AddConstant(offset)); + } + break; + case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1 + { + ir.Write(IROp::StoreFloat, ft, rs, ir.AddConstant(offset)); + } + break; + + default: + _dbg_assert_msg_(CPU, 0, "Trying to interpret FPULS instruction that can't be interpreted"); + break; + } } void IRJit::Comp_FPUComp(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index cba03ae7957b..439c0b25fd82 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -61,9 +61,11 @@ 
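A quick standalone illustration of the fields Comp_FPULS above extracts from an lwc1/swc1 word (the same layout the _IMM16/_RS/_FT macros encode); the example opcode here is made up for the demonstration:

#include <cstdint>
#include <cstdio>

int main() {
	uint32_t op = 0xC4A2FFF8;                  // lwc1 f2, -8(a1), as an example
	uint32_t major = op >> 26;                 // 49 = lwc1, 57 = swc1
	int32_t offset = (int16_t)(op & 0xFFFF);   // sign-extended, like _IMM16
	int rs = (op >> 21) & 0x1F;                // base GPR, like _RS
	int ft = (op >> 16) & 0x1F;                // FPU register, like _FT
	printf("major=%u ft=%d rs=%d offset=%d\n", major, ft, rs, offset);
	return 0;
}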
static const IRMeta irMeta[] = { { IROp::Load16, "Load16", "GGC" }, { IROp::Load16Ext, "Load16Ext", "GGC" }, { IROp::Load32, "Load32", "GGC" }, + { IROp::LoadFloat, "LoadFloat", "FGC" }, { IROp::Store8, "Store8", "GGC" }, { IROp::Store16, "Store16", "GGC" }, { IROp::Store32, "Store32", "GGC" }, + { IROp::StoreFloat, "StoreFloat", "FGC" }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, @@ -176,6 +178,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::Load32: mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + constPool[inst->src2]); break; + case IROp::LoadFloat: + mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + break; case IROp::Store8: Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); @@ -186,6 +191,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::Store32: Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; + case IROp::StoreFloat: + Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; case IROp::ShlImm: mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; diff --git a/Core/MemMap.h b/Core/MemMap.h index c49a41d119cc..c2b4ad3b12a6 100644 --- a/Core/MemMap.h +++ b/Core/MemMap.h @@ -186,6 +186,14 @@ inline u32 ReadUnchecked_U32(const u32 address) { #endif } +inline float ReadUnchecked_Float(const u32 address) { +#ifdef _ARCH_32 + return *(float *)(base + (address & MEMVIEW32_MASK)); +#else + return *(float *)(base + address); +#endif +} + inline u16 ReadUnchecked_U16(const u32 address) { #ifdef _ARCH_32 return *(u16_le *)(base + (address & MEMVIEW32_MASK)); @@ -210,6 +218,14 @@ inline void WriteUnchecked_U32(u32 data, u32 address) { #endif } +inline void WriteUnchecked_Float(float data, u32 address) { +#ifdef _ARCH_32 + *(float *)(base + (address & MEMVIEW32_MASK)) = data; +#else + *(float *)(base + address) = data; +#endif +} + inline void WriteUnchecked_U16(u16 data, u32 address) { #ifdef _ARCH_32 *(u16_le *)(base + (address & MEMVIEW32_MASK)) = data; From 7d4774db4c1bdea52d12781586b5c8168fe5ba8c Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 02:08:25 +0200 Subject: [PATCH 11/77] Fix wsbw --- Core/MIPS/IR/IRCompALU.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index d68150b17842..4a58a0224796 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -292,7 +292,7 @@ void IRJit::Comp_Allegrex2(MIPSOpcode op) { ir.Write(IROp::BSwap16, rd, rt); break; case 0xE0: //wsbw - ir.Write(IROp::BSwap16, rd, rt); + ir.Write(IROp::BSwap32, rd, rt); break; default: Comp_Generic(op); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 439c0b25fd82..b1aaa34abc21 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -307,8 +307,11 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } case IROp::BSwap32: - mips->r[inst->dest] = swap32(mips->r[inst->src1]); + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24); break; + } case IROp::FAdd: mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2]; From 
e750987052b8a5eea356d1535c7cd21181fc606a Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 02:08:54 +0200 Subject: [PATCH 12/77] ir-jit: Fix bug in ext --- Core/MIPS/IR/IRCompALU.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 4a58a0224796..fd5944fd0694 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -224,7 +224,6 @@ void IRJit::Comp_ShiftType(MIPSOpcode op) { void IRJit::Comp_Special3(MIPSOpcode op) { CONDITIONAL_DISABLE; - MIPSGPReg rs = _RS; MIPSGPReg rt = _RT; @@ -237,9 +236,13 @@ void IRJit::Comp_Special3(MIPSOpcode op) { return; switch (op & 0x3f) { - case 0x0: //ext - ir.Write(IROp::Shl, rt, rs); - ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(mask)); + case 0x0: + if (pos != 0) { + ir.Write(IROp::ShrImm, rt, rs, pos); + ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(mask)); + } else { + ir.Write(IROp::AndConst, rt, rs, ir.AddConstant(mask)); + } break; case 0x4: //ins From 46e839b2b2f09c60c1e356a4c8ae7997b40fa020 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 02:26:36 +0200 Subject: [PATCH 13/77] ir-jit: Fix bugs in rounding --- Core/MIPS/IR/IRInst.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index b1aaa34abc21..1fd990adba9b 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -345,7 +345,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c mips->r[inst->dest] = mips->vfpuCtrl[inst->src1]; break; case IROp::FRound: - mips->r[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); break; case IROp::FTrunc: { @@ -363,10 +363,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } case IROp::FCeil: - mips->r[inst->dest] = (int)ceilf(mips->f[inst->src1]); + mips->fs[inst->dest] = (int)ceilf(mips->f[inst->src1]); break; case IROp::FFloor: - mips->r[inst->dest] = (int)floorf(mips->f[inst->src1]); + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1]); break; case IROp::FCvtSW: From aae32bd929e9aed2cd4444477960028b0325d3eb Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 02:47:19 +0200 Subject: [PATCH 14/77] ir-jit: Re-enable mult. Fix bvf/bvt --- Core/MIPS/IR/IRCompALU.cpp | 1 - Core/MIPS/IR/IRCompBranch.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index fd5944fd0694..46c43ded73a0 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -305,7 +305,6 @@ void IRJit::Comp_Allegrex2(MIPSOpcode op) { void IRJit::Comp_MulDivType(MIPSOpcode op) { CONDITIONAL_DISABLE; - DISABLE; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; MIPSGPReg rd = _RD; diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 2b478f695a01..0cf3e7d8f79f 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -237,7 +237,7 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 notTakenTarget = GetCompilerPC() + (delaySlotIsBranch ? 
4 : 8); - ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(imm3)); + ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(1 << imm3)); FlushAll(); ir.Write(ComparisonToExit(cc), ir.AddConstant(notTakenTarget), IRTEMP_0, 0); From 14df39d7c9987f5daf50901869dfd2583f8e567d Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 10:36:37 +0200 Subject: [PATCH 15/77] Fix IRTEMP clash bug. Add more cases to the constant propagation pass. --- Core/MIPS/IR/IRCompALU.cpp | 9 ++++-- Core/MIPS/IR/IRCompBranch.cpp | 28 ++++++++--------- Core/MIPS/IR/IRInst.cpp | 6 ++++ Core/MIPS/IR/IRInst.h | 4 +-- Core/MIPS/IR/IRJit.cpp | 5 +-- Core/MIPS/IR/IRPassSimplify.cpp | 56 ++++++++++++++++++++++++++++----- Core/MIPS/IR/IRPassSimplify.h | 2 +- Core/MIPS/IR/IRRegCache.cpp | 11 ++++++- Core/MIPS/IR/IRRegCache.h | 2 ++ 9 files changed, 94 insertions(+), 29 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 46c43ded73a0..7f21c2c572d9 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -47,6 +47,7 @@ namespace MIPSComp { void IRJit::Comp_IType(MIPSOpcode op) { CONDITIONAL_DISABLE; + s32 simm = (s32)(s16)(op & 0xFFFF); // sign extension u32 uimm = op & 0xFFFF; u32 suimm = (u32)(s32)simm; @@ -236,7 +237,7 @@ void IRJit::Comp_Special3(MIPSOpcode op) { return; switch (op & 0x3f) { - case 0x0: + case 0x0: // ext if (pos != 0) { ir.Write(IROp::ShrImm, rt, rs, pos); ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(mask)); @@ -247,17 +248,21 @@ void IRJit::Comp_Special3(MIPSOpcode op) { case 0x4: //ins { + logBlocks = 1; u32 sourcemask = mask >> pos; u32 destmask = ~(sourcemask << pos); ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(sourcemask)); + if (pos != 0) { + ir.Write(IROp::ShlImm, IRTEMP_0, IRTEMP_0, pos); + } ir.Write(IROp::AndConst, rt, rt, ir.AddConstant(destmask)); - ir.Write(IROp::ShlImm, IRTEMP_0, IRTEMP_0, pos); ir.Write(IROp::Or, rt, rt, IRTEMP_0); } break; } } + void IRJit::Comp_Allegrex(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rt = _RT; diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 0cf3e7d8f79f..a290784904e2 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -73,12 +73,12 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) MIPSGPReg rhs = rt; if (!delaySlotIsNice) { // if likely, we don't need this if (rs != 0) { - ir.Write(IROp::Mov, IRTEMP_0, rs); - lhs = (MIPSGPReg)IRTEMP_0; + ir.Write(IROp::Mov, IRTEMP_LHS, rs); + lhs = (MIPSGPReg)IRTEMP_LHS; } if (rt != 0) { - ir.Write(IROp::Mov, IRTEMP_1, rt); - rhs = (MIPSGPReg)IRTEMP_1; + ir.Write(IROp::Mov, IRTEMP_RHS, rt); + rhs = (MIPSGPReg)IRTEMP_RHS; } } @@ -113,8 +113,8 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool MIPSGPReg lhs = rs; if (!delaySlotIsNice) { // if likely, we don't need this - ir.Write(IROp::Mov, IRTEMP_0, rs); - lhs = (MIPSGPReg)IRTEMP_0; + ir.Write(IROp::Mov, IRTEMP_LHS, rs); + lhs = (MIPSGPReg)IRTEMP_LHS; } if (andLink) ir.WriteSetConstant(MIPS_REG_RA, GetCompilerPC() + 8); @@ -179,7 +179,7 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 targetAddr = GetCompilerPC() + offset + 4; MIPSOpcode delaySlotOp = GetOffsetInstruction(1); - ir.Write(IROp::FpCondToReg, IRTEMP_0); + ir.Write(IROp::FpCondToReg, IRTEMP_LHS); if (!likely) CompileDelaySlot(); @@ -187,7 +187,7 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { FlushAll(); // Not taken - 
ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), IRTEMP_0, 0); + ir.Write(ComparisonToExit(cc), ir.AddConstant(GetCompilerPC() + 8), IRTEMP_LHS, 0); // Taken if (likely) CompileDelaySlot(); @@ -218,8 +218,8 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 targetAddr = GetCompilerPC() + offset + 4; MIPSOpcode delaySlotOp = GetOffsetInstruction(1); - - ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, VFPU_CTRL_CC); + logBlocks = 1; + ir.Write(IROp::VfpuCtrlToReg, IRTEMP_LHS, VFPU_CTRL_CC); ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); @@ -237,9 +237,9 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 notTakenTarget = GetCompilerPC() + (delaySlotIsBranch ? 4 : 8); - ir.Write(IROp::AndConst, IRTEMP_0, IRTEMP_0, ir.AddConstant(1 << imm3)); + ir.Write(IROp::AndConst, IRTEMP_LHS, IRTEMP_LHS, ir.AddConstant(1 << imm3)); FlushAll(); - ir.Write(ComparisonToExit(cc), ir.AddConstant(notTakenTarget), IRTEMP_0, 0); + ir.Write(ComparisonToExit(cc), ir.AddConstant(notTakenTarget), IRTEMP_LHS, 0); if (likely) CompileDelaySlot(); @@ -334,8 +334,8 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { FlushAll(); } else { // Bad delay slot. - ir.Write(IROp::Mov, IRTEMP_0, rs); - destReg = IRTEMP_0; + ir.Write(IROp::Mov, IRTEMP_LHS, rs); + destReg = IRTEMP_LHS; if (andLink) ir.WriteSetConstant(rd, GetCompilerPC() + 8); CompileDelaySlot(); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 1fd990adba9b..1a184e00eb4f 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -478,6 +478,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c default: Crash(); } +#ifdef _DEBUG + if (mips->r[0] != 0) + Crash(); +#endif inst++; } @@ -529,6 +533,8 @@ const char *GetGPRName(int r) { switch (r) { case IRTEMP_0: return "irtemp0"; case IRTEMP_1: return "irtemp1"; + case IRTEMP_LHS: return "irtemp_lhs"; + case IRTEMP_RHS: return "irtemp_rhs"; default: return "(unk)"; } } diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index e044825f1ccb..062d5189abc5 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -204,8 +204,8 @@ inline IROp ComparisonToExit(IRComparison comp) { enum { IRTEMP_0 = 192, IRTEMP_1, - IRTEMP_2, - IRTEMP_3, + IRTEMP_LHS, // Reserved for use in branches + IRTEMP_RHS, // Reserved for use in branches // Hacky way to get to other state IRREG_LO = 226, // offset of lo in MIPSState / 4 diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 0d8fca504b14..09f6acbe614e 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -269,7 +269,8 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { IRWriter *code = &ir; if (true) { - PropagateConstants(ir, simplified); + if (PropagateConstants(ir, simplified)) + logBlocks = 1; code = &simplified; } @@ -362,7 +363,7 @@ void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { } else { ERROR_LOG(HLE, "Replacement function %s has neither jit nor regular impl", entry->name); } -} +} void IRJit::Comp_Generic(MIPSOpcode op) { FlushAll(); diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 38141951de10..d5b943c23c1a 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -26,6 +26,12 @@ u32 Evaluate(u32 a, u32 b, IROp op) { case IROp::And: case IROp::AndConst: return a & b; case IROp::Or: case IROp::OrConst: return a | b; case IROp::Xor: case IROp::XorConst: return a ^ b; + case IROp::Shr: case IROp::ShrImm: return a >> b; 
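To make the new Evaluate cases concrete (Shr above, Sar/Ror/Shl/Slt just below), here is a standalone check of the rotate-right and signed set-less-than folds; shift amounts are assumed to be 1..31, since a plain C++ shift by 32 - 0 would be undefined:

#include <cstdint>
#include <cstdio>

// Rotate right as folded for IROp::Ror / IROp::RorImm (b assumed 1..31).
static uint32_t Ror(uint32_t a, uint32_t b) {
	return (a >> b) | (a << (32 - b));
}

int main() {
	printf("%08x\n", Ror(0x80000001, 1));              // c0000000
	printf("%d\n", (int32_t)0xFFFFFFFF < (int32_t)1);  // Slt: -1 < 1 -> 1
	return 0;
}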
+ case IROp::Sar: case IROp::SarImm: return (s32)a >> b; + case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b)); + case IROp::Shl: case IROp::ShlImm: return a << b; + case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b); + case IROp::SltU: case IROp::SltUConst: return (a < b); default: return -1; } @@ -38,16 +44,19 @@ IROp ArithToArithConst(IROp op) { case IROp::And: return IROp::AndConst; case IROp::Or: return IROp::OrConst; case IROp::Xor: return IROp::XorConst; + case IROp::Slt: return IROp::SltConst; + case IROp::SltU: return IROp::SltUConst; default: return (IROp)-1; } } -void PropagateConstants(const IRWriter &in, IRWriter &out) { +bool PropagateConstants(const IRWriter &in, IRWriter &out) { IRRegCache gpr(&out); const u32 *constants = in.GetConstants().data(); + bool logBlocks = false; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; bool symmetric = true; @@ -57,6 +66,8 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::Sub: + case IROp::Slt: + case IROp::SltU: symmetric = false; // fallthrough case IROp::Add: case IROp::And: @@ -67,7 +78,8 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { } else if (gpr.IsImm(inst.src2) && inst.src1 != inst.src2 && inst.dest != inst.src2) { gpr.MapDirtyIn(inst.dest, inst.src1); if (gpr.GetImm(inst.src2) == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) { - out.Write(IROp::Mov, inst.dest, inst.src1); + if (inst.dest != inst.src1) + out.Write(IROp::Mov, inst.dest, inst.src1); } else { out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(gpr.GetImm(inst.src2))); } @@ -85,6 +97,8 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::AndConst: case IROp::OrConst: case IROp::XorConst: + case IROp::SltConst: + case IROp::SltUConst: if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), constants[inst.src2], inst.op)); } else { @@ -93,6 +107,18 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::ShlImm: + case IROp::ShrImm: + case IROp::RorImm: + case IROp::SarImm: + if (gpr.IsImm(inst.src1)) { + gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op)); + } else { + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + } + break; + case IROp::Mov: if (inst.src1 == inst.src2) { // Nop @@ -107,18 +133,33 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::Store8: case IROp::Store16: case IROp::Store32: - // Just pass through, no excessive flushing - gpr.MapInIn(inst.dest, inst.src1); - goto doDefault; + if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) { + gpr.MapIn(inst.dest); + out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); + } else { + // Just pass through, no excessive flushing + gpr.MapInIn(inst.dest, inst.src1); + goto doDefault; + } + break; case IROp::Load8: case IROp::Load8Ext: case IROp::Load16: case IROp::Load16Ext: case IROp::Load32: - gpr.MapDirtyIn(inst.dest, inst.src1); - goto doDefault; + if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest && inst.src2 != inst.dest) { + gpr.MapDirty(inst.dest); + out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); + logBlocks = true; + } else { + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + } + break; + case IROp::Syscall: + case IROp::Interpret: case IROp::ExitToConst: case IROp::ExitToReg: case IROp::ExitToConstIfEq: @@ -155,4 
+196,5 @@ void PropagateConstants(const IRWriter &in, IRWriter &out) { } } } + return logBlocks; } \ No newline at end of file diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index b5d0af1e95d1..5a57be1cfae4 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -6,4 +6,4 @@ void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); -void PropagateConstants(const IRWriter &in, IRWriter &out); \ No newline at end of file +bool PropagateConstants(const IRWriter &in, IRWriter &out); \ No newline at end of file diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index f1c020139579..c7e11aa6d9ea 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -26,10 +26,19 @@ IRRegCache::IRRegCache(IRWriter *ir) : ir_(ir) { void IRRegCache::FlushAll() { for (int i = 0; i < TOTAL_MAPPABLE_MIPSREGS; i++) { - Flush(i); + if (i < IRTEMP_0) + Flush(i); } } +void IRRegCache::MapIn(int rd) { + Flush(rd); +} + +void IRRegCache::MapDirty(int rd) { + Discard(rd); +} + void IRRegCache::MapInIn(int rs, int rt) { Flush(rs); Flush(rt); diff --git a/Core/MIPS/IR/IRRegCache.h b/Core/MIPS/IR/IRRegCache.h index 1d7e78f7a888..68570f50acf5 100644 --- a/Core/MIPS/IR/IRRegCache.h +++ b/Core/MIPS/IR/IRRegCache.h @@ -32,6 +32,8 @@ class IRRegCache { void FlushAll(); + void MapDirty(int rd); + void MapIn(int rd); void MapInIn(int rs, int rt); void MapDirtyIn(int rd, int rs); void MapDirtyInIn(int rd, int rs, int rt); From 98113edbd4359441df7ce25241bb2ee34ba66670 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 11:29:11 +0200 Subject: [PATCH 16/77] More simplify pass --- Core/MIPS/IR/IRJit.cpp | 9 ++++++--- Core/MIPS/IR/IRPassSimplify.cpp | 11 +++++------ Core/MIPS/IR/IRRegCache.cpp | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 09f6acbe614e..cc8bc8b65cc6 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -256,7 +256,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { js.compilerPC += 4; js.numInstructions++; - if (ir.GetConstants().size() > 128) { + if (ir.GetConstants().size() > 64) { // Need to break the block ir.Write(IROp::ExitToConst, ir.AddConstant(js.compilerPC)); js.compiling = false; @@ -272,6 +272,9 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { if (PropagateConstants(ir, simplified)) logBlocks = 1; code = &simplified; + // Some blocks in tekken generate curious numbers of constants after propagation. 
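Worth spelling out: the IRRegCache these passes rely on is purely a constant tracker. It remembers which GPRs currently hold a known immediate; Discard just forgets one, while Flush presumably also has to materialize the pending value into the IR before forgetting it (that part is omitted in the toy model below, which is an illustration rather than the real class):

#include <cstdint>
#include <cstdio>

struct ToyImmCache {
	struct Reg { bool isImm = false; uint32_t immVal = 0; };
	Reg reg[32];

	void SetImm(int r, uint32_t v) { reg[r].isImm = true; reg[r].immVal = v; }
	bool IsImm(int r) const { return reg[r].isImm; }
	uint32_t GetImm(int r) const { return reg[r].immVal; }

	// Register 0 is always zero, so it never needs to be touched.
	void Discard(int r) { if (r != 0) reg[r].isImm = false; }
	void Flush(int r)   { Discard(r); }  // the real cache would emit the constant first

	// The destination is overwritten (Discard) unless it is also the source,
	// which must be flushed so its value really exists before being read.
	void MapDirtyIn(int rd, int rs) {
		if (rs != rd) Discard(rd);
		Flush(rs);
	}
};

int main() {
	ToyImmCache c;
	c.SetImm(4, 0x08804000);
	c.MapDirtyIn(5, 4);
	printf("r4 tracked: %d, r5 tracked: %d\n", c.IsImm(4), c.IsImm(5));
	return 0;
}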
+ //if (ir.GetConstants().size() >= 64) + // logBlocks = 1; } b->SetInstructions(code->GetInstructions(), code->GetConstants()); @@ -287,7 +290,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } if (logBlocks > 0 && dontLogBlocks == 0) { - ILOG("=============== Original IR (%d instructions) ===============", (int)ir.GetInstructions().size()); + ILOG("=============== Original IR (%d instructions, %d const) ===============", (int)ir.GetInstructions().size(), (int)ir.GetConstants().size()); for (int i = 0; i < ir.GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); @@ -297,7 +300,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } if (logBlocks > 0 && dontLogBlocks == 0) { - ILOG("=============== IR (%d instructions) ===============", (int)code->GetInstructions().size()); + ILOG("=============== IR (%d instructions, %d const) ===============", (int)code->GetInstructions().size(), (int)code->GetConstants().size()); for (int i = 0; i < code->GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index d5b943c23c1a..5eb1ea800107 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -62,7 +62,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { bool symmetric = true; switch (inst.op) { case IROp::SetConst: - gpr.SetImm((MIPSGPReg)inst.dest, constants[inst.src1]); + gpr.SetImm(inst.dest, constants[inst.src1]); break; case IROp::Sub: @@ -83,7 +83,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } else { out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(gpr.GetImm(inst.src2))); } - } else if (gpr.IsImm(inst.src1) && inst.src1 != inst.src2 && inst.dest != inst.src2 && symmetric) { + } else if (symmetric && gpr.IsImm(inst.src1) && inst.src1 != inst.src2 && inst.dest != inst.src2) { gpr.MapDirtyIn(inst.dest, inst.src2); out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(gpr.GetImm(inst.src1))); } else { @@ -91,7 +91,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { goto doDefault; } break; - + case IROp::AddConst: case IROp::SubConst: case IROp::AndConst: @@ -120,7 +120,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::Mov: - if (inst.src1 == inst.src2) { + if (inst.dest == inst.src1) { // Nop } else if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, gpr.GetImm(inst.src1)); @@ -148,10 +148,9 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::Load16: case IROp::Load16Ext: case IROp::Load32: - if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest && inst.src2 != inst.dest) { + if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) { gpr.MapDirty(inst.dest); out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); - logBlocks = true; } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index c7e11aa6d9ea..09aeeb9c9026 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -26,7 +26,7 @@ IRRegCache::IRRegCache(IRWriter *ir) : ir_(ir) { void IRRegCache::FlushAll() { for (int i = 0; i < TOTAL_MAPPABLE_MIPSREGS; i++) { - if (i < IRTEMP_0) + //if (i < IRTEMP_0) Flush(i); } } From 3eb5480adeec8896e57f938c8cd667eea65829a6 Mon Sep 17 00:00:00 2001 
From: Henrik Rydgard Date: Sun, 8 May 2016 13:32:22 +0200 Subject: [PATCH 17/77] Initial VFPU --- Core/MIPS/IR/IRCompVFPU.cpp | 66 +++++++++++++++++++++++++++++++++++-- Core/MIPS/IR/IRInst.cpp | 53 ++++++++++++++++++++++++++++- Core/MIPS/IR/IRInst.h | 7 +++- 3 files changed, 122 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index d7b807fe6347..1f2623ac67e3 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -181,7 +181,38 @@ namespace MIPSComp { } void IRJit::Comp_SVQ(MIPSOpcode op) { - DISABLE; + int imm = (signed short)(op & 0xFFFC); + int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5); + MIPSGPReg rs = _RS; + + u8 vregs[4]; + GetVectorRegs(vregs, V_Quad, vt); + + switch (op >> 26) { + case 54: //lv.q + { + // TODO: Add vector load/store instruction to the IR + ir.Write(IROp::LoadFloatV, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::LoadFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::LoadFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::LoadFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + } + break; + + case 62: //sv.q + { + // CC might be set by slow path below, so load regs first. + ir.Write(IROp::StoreFloatV, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::StoreFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::StoreFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::StoreFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + } + break; + + default: + DISABLE; + break; + } } void IRJit::Comp_VVectorInit(MIPSOpcode op) { @@ -215,6 +246,11 @@ namespace MIPSComp { } void IRJit::Comp_VV2Op(MIPSOpcode op) { + CONDITIONAL_DISABLE; + // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure + if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) { + return; + } DISABLE; } @@ -231,7 +267,33 @@ namespace MIPSComp { } void IRJit::Comp_Mftv(MIPSOpcode op) { - DISABLE; + int imm = op & 0xFF; + MIPSGPReg rt = _RT; + switch ((op >> 21) & 0x1f) { + case 3: //mfv / mfvc + // rt = 0, imm = 255 appears to be used as a CPU interlock by some games. 
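Since the IR has no 128-bit memory op yet (the TODO above), lv.q/sv.q are expanded into four scalar LoadFloatV/StoreFloatV ops at imm, imm+4, imm+8, imm+12. The same expansion over a plain byte array standing in for guest memory, purely as an illustration:

#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
	// Pretend guest memory holding a quad of floats at offset 0.
	uint8_t mem[16];
	const float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
	memcpy(mem, src, sizeof(src));

	// lv.q: four 4-byte float loads at consecutive offsets.
	float v[4];
	for (int i = 0; i < 4; i++)
		memcpy(&v[i], mem + i * 4, sizeof(float));

	printf("%.1f %.1f %.1f %.1f\n", v[0], v[1], v[2], v[3]);
	return 0;
}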
+ if (rt != 0) { + if (imm < 128) { //R(rt) = VI(imm); + ir.Write(IROp::VMovToGPR, rt, imm); + logBlocks = 1; + } else { + DISABLE; + } + } + break; + + case 7: // mtv + if (imm < 128) { + ir.Write(IROp::VMovFromGPR, imm, rt); + logBlocks = 1; + } else { + DISABLE; + } + break; + + default: + DISABLE; + } } void IRJit::Comp_Vmfvc(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 1a184e00eb4f..45902a4485c4 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -10,6 +10,8 @@ static const IRMeta irMeta[] = { { IROp::SetConst, "SetConst", "GC" }, + { IROp::SetConstF, "SetConstF", "FC" }, + { IROp::SetConstV, "SetConstV", "VC" }, { IROp::Mov, "Mov", "GG" }, { IROp::Add, "Add", "GGG" }, { IROp::Sub, "Sub", "GGG" }, @@ -62,10 +64,12 @@ static const IRMeta irMeta[] = { { IROp::Load16Ext, "Load16Ext", "GGC" }, { IROp::Load32, "Load32", "GGC" }, { IROp::LoadFloat, "LoadFloat", "FGC" }, + { IROp::LoadFloatV, "LoadFloatV", "VGC" }, { IROp::Store8, "Store8", "GGC" }, { IROp::Store16, "Store16", "GGC" }, { IROp::Store32, "Store32", "GGC" }, { IROp::StoreFloat, "StoreFloat", "FGC" }, + { IROp::StoreFloatV, "StoreFloatV", "VGC" }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, @@ -82,6 +86,8 @@ static const IRMeta irMeta[] = { { IROp::FCvtSW, "FCvtSW", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, + { IROp::VMovFromGPR, "VMovFromGPR", "VG" }, + { IROp::VMovToGPR, "VMovToGPR", "GV" }, { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, @@ -117,6 +123,12 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::SetConst: mips->r[inst->dest] = constPool[inst->src1]; break; + case IROp::SetConstF: + memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); + break; + case IROp::SetConstV: + memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); + break; case IROp::Add: mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; break; @@ -181,6 +193,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::LoadFloat: mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); break; + case IROp::LoadFloatV: + mips->v[voffset[inst->dest]] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + break; case IROp::Store8: Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); @@ -194,6 +209,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::StoreFloat: Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; + case IROp::StoreFloatV: + Memory::WriteUnchecked_Float(mips->v[voffset[inst->src3]], mips->r[inst->src1] + constPool[inst->src2]); + break; case IROp::ShlImm: mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; @@ -389,6 +407,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c } break; //cvt.w.s } + case IROp::ZeroFpCond: mips->fpcond = 0; break; @@ -400,6 +419,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); break; + case IROp::VMovFromGPR: + memcpy(&mips->v[voffset[inst->dest]], &mips->r[inst->src1], 4); + break; + case IROp::VMovToGPR: + memcpy(&mips->r[inst->dest], 
&mips->v[voffset[inst->src1]], 4); + break; + case IROp::ExitToConst: return constPool[inst->dest]; @@ -540,12 +566,31 @@ const char *GetGPRName(int r) { } void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *constPool) { + static const char *vfpuCtrlNames[VFPU_CTRL_MAX] = { + "SPFX", + "TPFX", + "DPFX", + "CC", + "INF4", + "RSV5", + "RSV6", + "REV", + "RCX0", + "RCX1", + "RCX2", + "RCX3", + "RCX4", + "RCX5", + "RCX6", + "RCX7", + }; + switch (type) { case 'G': snprintf(buf, bufSize, "%s", GetGPRName(param)); break; case 'F': - snprintf(buf, bufSize, "r%d", param); + snprintf(buf, bufSize, "f%d", param); break; case 'C': snprintf(buf, bufSize, "%08x", constPool[param]); @@ -553,6 +598,12 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'I': snprintf(buf, bufSize, "%02x", param); break; + case 'V': + snprintf(buf, bufSize, "v%d", param); + break; + case 'T': + snprintf(buf, bufSize, "%s", vfpuCtrlNames[param]); + break; case '_': case '\0': buf[0] = 0; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 062d5189abc5..b19651e3d8fb 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -17,7 +17,8 @@ enum class IROp : u8 { SetConst, - FSetConst, + SetConstF, + SetConstV, Mov, @@ -88,11 +89,13 @@ enum class IROp : u8 { Load16Ext, Load32, LoadFloat, + LoadFloatV, Store8, Store16, Store32, StoreFloat, + StoreFloatV, Ext8to32, Ext16to32, @@ -136,6 +139,8 @@ enum class IROp : u8 { UpdateRoundingMode, SetCtrlVFPU, + VMovFromGPR, + VMovToGPR, // Fake/System instructions Interpret, From 492ea5fac43f015eade155861bb9c248dff1d761 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 21:38:03 +0200 Subject: [PATCH 18/77] Address a bunch of comments, thanks for the review --- Core/MIPS/ARM64/Arm64Jit.cpp | 2 -- Core/MIPS/IR/IRCompALU.cpp | 3 --- Core/MIPS/IR/IRCompBranch.cpp | 30 +++++++++++++++++++----------- Core/MIPS/IR/IRCompFPU.cpp | 19 +++++++------------ Core/MIPS/IR/IRInst.h | 10 ---------- Core/MIPS/IR/IRJit.cpp | 25 +++---------------------- Core/MIPS/IR/IRJit.h | 6 +++--- Core/MIPS/JitCommon/JitCommon.cpp | 2 +- Core/MIPS/x86/Asm.cpp | 2 +- Core/MemMapFunctions.cpp | 2 +- 10 files changed, 35 insertions(+), 66 deletions(-) diff --git a/Core/MIPS/ARM64/Arm64Jit.cpp b/Core/MIPS/ARM64/Arm64Jit.cpp index 427126e26e6f..83c7ec6b9720 100644 --- a/Core/MIPS/ARM64/Arm64Jit.cpp +++ b/Core/MIPS/ARM64/Arm64Jit.cpp @@ -284,8 +284,6 @@ const u8 *Arm64Jit::DoJit(u32 em_address, JitBlock *b) { gpr.Start(analysis); fpr.Start(analysis); - int partialFlushOffset = 0; - js.numInstructions = 0; while (js.compiling) { gpr.SetCompilerPC(GetCompilerPC()); // Let it know for log messages diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 7f21c2c572d9..82053dc63fa3 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -192,8 +192,6 @@ void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpConst) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; - // Not sure if ARM64 wraps like this so let's do it for it. 
(TODO: According to the ARM ARM, it will indeed mask for us so this is not necessary) - // ANDI2R(SCRATCH1, gpr.R(rs), 0x1F, INVALID_REG); ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31)); ir.Write(shiftOp, rd, rt, IRTEMP_0); } @@ -248,7 +246,6 @@ void IRJit::Comp_Special3(MIPSOpcode op) { case 0x4: //ins { - logBlocks = 1; u32 sourcemask = mask >> pos; u32 destmask = ~(sourcemask << pos); ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(sourcemask)); diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index a290784904e2..e2d6c99c8523 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -53,8 +53,7 @@ namespace MIPSComp { using namespace Arm64Gen; -void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) -{ +void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in RSRTComp delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -67,11 +66,12 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs); - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); MIPSGPReg lhs = rs; MIPSGPReg rhs = rt; - if (!delaySlotIsNice) { // if likely, we don't need this + if (!delaySlotIsNice && !likely) { // if likely, we don't need this if (rs != 0) { ir.Write(IROp::Mov, IRTEMP_LHS, rs); lhs = (MIPSGPReg)IRTEMP_LHS; @@ -109,7 +109,8 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); MIPSGPReg lhs = rs; if (!delaySlotIsNice) { // if likely, we don't need this @@ -136,13 +137,13 @@ void IRJit::Comp_RelBranch(MIPSOpcode op) { // The CC flags here should be opposite of the actual branch becuase they skip the branching action. 
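The AndConst with 31 kept in CompShiftVar above matches the MIPS rule that variable shifts (sllv/srlv/srav) use only the low five bits of the shift register; in plain C:

#include <cstdint>
#include <cstdio>

// sllv rd, rt, rs: only the low 5 bits of rs count as the shift amount.
static uint32_t Sllv(uint32_t rt, uint32_t rs) {
	return rt << (rs & 31);
}

int main() {
	printf("%08x\n", Sllv(1, 33));  // 33 & 31 == 1, so the result is 00000002
	return 0;
}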
switch (op >> 26) { case 4: BranchRSRTComp(op, IRComparison::NotEqual, false); break;//beq - case 5: BranchRSRTComp(op, IRComparison::Equal, false); break;//bne + case 5: BranchRSRTComp(op, IRComparison::Equal, false); break;//bne case 6: BranchRSZeroComp(op, IRComparison::Greater, false, false); break;//blez case 7: BranchRSZeroComp(op, IRComparison::LessEqual, false, false); break;//bgtz case 20: BranchRSRTComp(op, IRComparison::NotEqual, true); break;//beql - case 21: BranchRSRTComp(op, IRComparison::Equal, true); break;//bnel + case 21: BranchRSRTComp(op, IRComparison::Equal, true); break;//bnel case 22: BranchRSZeroComp(op, IRComparison::Greater, false, true); break;//blezl case 23: BranchRSZeroComp(op, IRComparison::LessEqual, false, true); break;//bgtzl @@ -183,7 +184,8 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { if (!likely) CompileDelaySlot(); - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); FlushAll(); // Not taken @@ -221,7 +223,8 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { logBlocks = 1; ir.Write(IROp::VfpuCtrlToReg, IRTEMP_LHS, VFPU_CTRL_CC); - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); // Sometimes there's a VFPU branch in a delay slot (Disgaea 2: Dark Hero Days, Zettai Hero Project, La Pucelle) // The behavior is undefined - the CPU may take the second branch even if the first one passes. @@ -268,7 +271,8 @@ void IRJit::Comp_Jump(MIPSOpcode op) { u32 off = _IMM26 << 2; u32 targetAddr = (GetCompilerPC() & 0xF0000000) | off; - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); // Might be a stubbed address or something? 
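A small sketch of why the table above passes inverted conditions: the generated IR exits to the not-taken address (PC + 8, past the delay slot) when the branch condition fails, and only the fall-through path continues toward the taken target. Values below are made up for the illustration:

#include <cstdint>
#include <cstdio>

int main() {
	uint32_t pc = 0x08804000;
	int32_t offset = 0x40;
	uint32_t takenTarget = pc + 4 + offset;  // GetCompilerPC() + offset + 4
	uint32_t notTaken = pc + 8;              // skip the branch and its delay slot

	uint32_t rs = 1, rt = 2;
	// beq rs, rt, target is lowered as "exit to notTaken if rs != rt", hence
	// IRComparison::NotEqual in the beq entry of the table.
	if (rs != rt)
		printf("exit to %08x (branch not taken)\n", notTaken);
	else
		printf("fall through toward %08x (branch taken)\n", takenTarget);
	return 0;
}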
if (!Memory::IsValidAddress(targetAddr)) { @@ -316,7 +320,8 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { if (andLink && rs == rd) delaySlotIsNice = false; - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); int destReg; if (IsSyscall(delaySlotOp)) { @@ -363,6 +368,9 @@ void IRJit::Comp_Syscall(MIPSOpcode op) { RestoreRoundingMode(); js.downcountAmount = -offset; + int dcAmount = js.downcountAmount + 1; + ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + FlushAll(); ir.Write(IROp::Syscall, 0, ir.AddConstant(op.encoding)); diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index c4353dd9836b..b0ff42cf261c 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -80,15 +80,12 @@ void IRJit::Comp_FPULS(MIPSOpcode op) { switch (op >> 26) { case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1 - { ir.Write(IROp::LoadFloat, ft, rs, ir.AddConstant(offset)); - } - break; + break; + case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1 - { ir.Write(IROp::StoreFloat, ft, rs, ir.AddConstant(offset)); - } - break; + break; default: _dbg_assert_msg_(CPU, 0, "Trying to interpret FPULS instruction that can't be interpreted"); @@ -97,7 +94,7 @@ void IRJit::Comp_FPULS(MIPSOpcode op) { } void IRJit::Comp_FPUComp(MIPSOpcode op) { - DISABLE; + DISABLE; // IROps not yet implemented int opc = op & 0xF; if (opc >= 8) opc -= 8; // alias @@ -195,8 +192,7 @@ void IRJit::Comp_FPU2op(MIPSOpcode op) { } } -void IRJit::Comp_mxc1(MIPSOpcode op) -{ +void IRJit::Comp_mxc1(MIPSOpcode op) { CONDITIONAL_DISABLE; int fs = _FS; @@ -215,9 +211,8 @@ void IRJit::Comp_mxc1(MIPSOpcode op) return; } if (fs == 31) { - DISABLE; - } - else if (fs == 0) { + DISABLE; // TODO: Add a new op + } else if (fs == 0) { ir.Write(IROp::SetConst, rt, ir.AddConstant(MIPSState::FCR0_VALUE)); } else { // Unsupported regs are always 0. 
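The Downcount writes above split the (now +1 adjusted) cycle count into two byte-sized operands because IR operands are single bytes; the low byte goes in src1 and the high byte in src2, and the obvious inverse puts them back together. A round trip of that encoding, assuming the amount fits in 16 bits:

#include <cstdio>

int main() {
	int dcAmount = 0x1234 + 1;            // downcountAmount + 1, as in the hunks above
	unsigned lo = dcAmount & 0xFF;        // passed as src1
	unsigned hi = (dcAmount >> 8) & 0xFF; // passed as src2
	int decoded = (int)(lo | (hi << 8));  // reassembly on the consuming side
	printf("%02x %02x -> %d (expected %d)\n", hi, lo, decoded, dcAmount);
	return 0;
}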
diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index b19651e3d8fb..a25996590607 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -219,16 +219,6 @@ enum { IRREG_FPCOND = 229 }; -enum class IRParam { - Ignore = '_', - UImm8 = 'U', - Const = 'C', - GPR = 'G', - FPR = 'F', - VPR = 'V', - VCtrl = 'T', -}; - struct IRMeta { IROp op; const char *name; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index cc8bc8b65cc6..b9522c6b88fd 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -39,8 +39,7 @@ #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/JitCommon/JitCommon.h" -namespace MIPSComp -{ +namespace MIPSComp { IRJit::IRJit(MIPSState *mips) : mips_(mips) { logBlocks = 0; @@ -48,8 +47,7 @@ IRJit::IRJit(MIPSState *mips) : mips_(mips) { js.startDefaultPrefix = mips_->HasDefaultPrefix(); js.currentRoundingFunc = convertS0ToSCRATCH1[0]; u32 size = 128 * 1024; - blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); - logBlocks = 12; + // blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); InitIR(); } @@ -110,7 +108,7 @@ void IRJit::FlushPrefixV() { } void IRJit::ClearCache() { - ILOG("ARM64Jit: Clearing the cache!"); + ILOG("IRJit: Clearing the cache!"); blocks_.Clear(); } @@ -184,12 +182,6 @@ void IRJit::RunLoopUntil(u64 globalticks) { // ApplyRoundingMode(true); // IR Dispatcher - FILE *f; - int numBlocks = 0; - if (numBlocks) { - f = fopen("E:\\blockir.txt", "w"); - } - while (true) { // RestoreRoundingMode(true); CoreTiming::Advance(); @@ -203,18 +195,9 @@ void IRJit::RunLoopUntil(u64 globalticks) { u32 data = inst & 0xFFFFFF; if (opcode == (MIPS_EMUHACK_OPCODE >> 24)) { IRBlock *block = blocks_.GetBlock(data); - if (numBlocks > 0) { - // ILOG("Run block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); - fprintf(f, "BLOCK : %08x v0: %08x v1: %08x a0: %08x s0: %08x s4: %08x\n", mips_->pc, mips_->r[MIPS_REG_V0], mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0], mips_->r[MIPS_REG_S0], mips_->r[MIPS_REG_S4]); - fflush(f); - numBlocks--; - } mips_->pc = IRInterpret(mips_, block->GetInstructions(), block->GetConstants(), block->GetNumInstructions()); } else { - if (mips_->pc == 0x0880de94) - logBlocks = 10; // RestoreRoundingMode(true); - // ILOG("Compile block at %08x : v1=%08x a0=%08x", mips_->pc, mips_->r[MIPS_REG_V1], mips_->r[MIPS_REG_A0]); Compile(mips_->pc); // ApplyRoundingMode(true); } @@ -246,8 +229,6 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { js.PrefixStart(); ir.Clear(); - int partialFlushOffset = 0; - js.numInstructions = 0; while (js.compiling) { MIPSOpcode inst = Memory::Read_Opcode_JIT(GetCompilerPC()); diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 13e9162fa71c..e26f1c24391c 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -265,9 +265,9 @@ class IRJit : public JitInterface { IRWriter ir; - // where to write branch-likely trampolines - u32 blTrampolines_; - int blTrampolineCount_; + // where to write branch-likely trampolines. 
not used atm + // u32 blTrampolines_; + // int blTrampolineCount_; public: // Code pointers diff --git a/Core/MIPS/JitCommon/JitCommon.cpp b/Core/MIPS/JitCommon/JitCommon.cpp index e267b9352ed1..0a1acd8229e1 100644 --- a/Core/MIPS/JitCommon/JitCommon.cpp +++ b/Core/MIPS/JitCommon/JitCommon.cpp @@ -50,7 +50,7 @@ namespace MIPSComp { #if defined(ARM) return new MIPSComp::ArmJit(mips); #elif defined(ARM64) - return new MIPSComp::IRJit(mips); + return new MIPSComp::Arm64Jit(mips); #elif defined(_M_IX86) || defined(_M_X64) return new MIPSComp::Jit(mips); #elif defined(MIPS) diff --git a/Core/MIPS/x86/Asm.cpp b/Core/MIPS/x86/Asm.cpp index 86dfc1d7fb1f..05eda2823d77 100644 --- a/Core/MIPS/x86/Asm.cpp +++ b/Core/MIPS/x86/Asm.cpp @@ -40,7 +40,7 @@ namespace MIPSComp //TODO - make an option //#if _DEBUG - static bool enableDebug = true; +static bool enableDebug = false; //#else // bool enableDebug = false; diff --git a/Core/MemMapFunctions.cpp b/Core/MemMapFunctions.cpp index d367205ef797..112ae7093c7f 100644 --- a/Core/MemMapFunctions.cpp +++ b/Core/MemMapFunctions.cpp @@ -87,7 +87,7 @@ inline void ReadFromHardware(T &var, const u32 address) { var = *((const T*)GetPointerUnchecked(address)); } else { // In jit, we only flush PC when bIgnoreBadMemAccess is off. - if (g_Config.iCpuCore != CPU_CORE_INTERPRETER && g_Config.bIgnoreBadMemAccess) { + if (g_Config.iCpuCore == CPU_CORE_JIT && g_Config.bIgnoreBadMemAccess) { WARN_LOG(MEMMAP, "ReadFromHardware: Invalid address %08x", address); } else { WARN_LOG(MEMMAP, "ReadFromHardware: Invalid address %08x PC %08x LR %08x", address, currentMIPS->pc, currentMIPS->r[MIPS_REG_RA]); From f8659b8e1e59cf570cd0f44a786d004311fdfb06 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 21:56:52 +0200 Subject: [PATCH 19/77] Move the IR interpreter out into its own file. Rename it in the UI. First CMake and Android fixes. 
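The interpreter's contract is unchanged by the move: a compiled block is a flat IRInst array plus its constant pool, and IRInterpret() returns the address to continue executing at. A simplified sketch of the call the dispatcher makes (the real call site is IRJit::RunLoopUntil; the wrapper name here is made up for illustration):

#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/IR/IRJit.h"
#include "Core/MIPS/IR/IRInterpreter.h"

// Run one compiled IR block and leave the next PC in the MIPS state.
static void RunOneBlock(MIPSState *mips, IRBlock *block) {
	mips->pc = IRInterpret(mips, block->GetInstructions(), block->GetConstants(),
	                       block->GetNumInstructions());
}
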
--- CMakeLists.txt | 19 ++ Core/Core.vcxproj | 2 + Core/Core.vcxproj.filters | 6 + Core/MIPS/ARM/ArmJit.cpp | 10 + Core/MIPS/ARM/ArmJit.h | 1 + Core/MIPS/ARM64/Arm64Jit.cpp | 10 + Core/MIPS/ARM64/Arm64Jit.h | 1 + Core/MIPS/IR/IRInst.cpp | 407 +------------------------------- Core/MIPS/IR/IRInterpreter.cpp | 411 +++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRInterpreter.h | 8 + Core/MIPS/IR/IRJit.cpp | 1 + UI/DevScreens.cpp | 13 ++ UI/GameSettingsScreen.cpp | 2 +- android/jni/Android.mk | 10 + 14 files changed, 494 insertions(+), 407 deletions(-) create mode 100644 Core/MIPS/IR/IRInterpreter.cpp create mode 100644 Core/MIPS/IR/IRInterpreter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d137ba551bd..655d8f7e1306 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1077,6 +1077,25 @@ include_directories(ext/xxhash) set(CoreExtra) set(CoreExtraLibs) + +set(CoreExtra ${CoreExtra} + Core/MIPS/IR/IRCompALU.cpp + Core/MIPS/IR/IRCompBranch.cpp + Core/MIPS/IR/IRCompFPU.cpp + Core/MIPS/IR/IRCompLoadStore.cpp + Core/MIPS/IR/IRCompVFPU.cpp + Core/MIPS/IR/IRInst.cpp + Core/MIPS/IR/IRInst.h + Core/MIPS/IR/IRInterpreter.cpp + Core/MIPS/IR/IRInterpreter.h + Core/MIPS/IR/IRJit.cpp + Core/MIPS/IR/IRJit.h + Core/MIPS/IR/IRPassSimplify.cpp + Core/MIPS/IR/IRPassSimplify.h + Core/MIPS/IR/IRRegCache.cpp + Core/MIPS/IR/IRRegCache.h + ) + if(ARM) set(CoreExtra ${CoreExtra} Core/MIPS/ARM/ArmAsm.cpp diff --git a/Core/Core.vcxproj b/Core/Core.vcxproj index e902adf7332d..561d83d2d36e 100644 --- a/Core/Core.vcxproj +++ b/Core/Core.vcxproj @@ -188,6 +188,7 @@ + @@ -518,6 +519,7 @@ + diff --git a/Core/Core.vcxproj.filters b/Core/Core.vcxproj.filters index 5905d62de115..0fc92ec2fad1 100644 --- a/Core/Core.vcxproj.filters +++ b/Core/Core.vcxproj.filters @@ -667,6 +667,9 @@ MIPS\IR + + MIPS\IR + @@ -1224,6 +1227,9 @@ MIPS\IR + + MIPS\IR + diff --git a/Core/MIPS/ARM/ArmJit.cpp b/Core/MIPS/ARM/ArmJit.cpp index eaf0a9741136..508a6c3b1fa4 100644 --- a/Core/MIPS/ARM/ArmJit.cpp +++ b/Core/MIPS/ARM/ArmJit.cpp @@ -677,4 +677,14 @@ void ArmJit::WriteSyscallExit() void ArmJit::Comp_DoNothing(MIPSOpcode op) { } +MIPSOpcode ArmJit::GetOriginalOp(MIPSOpcode op) { + JitBlockCache *bc = GetBlockCache(); + int block_num = bc->GetBlockNumberFromEmuHackOp(op, true); + if (block_num >= 0) { + return bc->GetOriginalFirstOp(block_num); + } else { + return op; + } } + +} // namespace diff --git a/Core/MIPS/ARM/ArmJit.h b/Core/MIPS/ARM/ArmJit.h index 07b70af9688d..efdde624bf36 100644 --- a/Core/MIPS/ARM/ArmJit.h +++ b/Core/MIPS/ARM/ArmJit.h @@ -53,6 +53,7 @@ class ArmJit : public ArmGen::ARMXCodeBlock, public JitInterface { void Compile(u32 em_address) override; // Compiles a block at current MIPS PC bool DescribeCodePtr(const u8 *ptr, std::string &name) override; + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; void Comp_RunBlock(MIPSOpcode op) override; void Comp_ReplacementFunc(MIPSOpcode op) override; diff --git a/Core/MIPS/ARM64/Arm64Jit.cpp b/Core/MIPS/ARM64/Arm64Jit.cpp index 83c7ec6b9720..a3f032ea5aa6 100644 --- a/Core/MIPS/ARM64/Arm64Jit.cpp +++ b/Core/MIPS/ARM64/Arm64Jit.cpp @@ -634,4 +634,14 @@ void Arm64Jit::WriteSyscallExit() { void Arm64Jit::Comp_DoNothing(MIPSOpcode op) { } +MIPSOpcode Arm64Jit::GetOriginalOp(MIPSOpcode op) { + JitBlockCache *bc = GetBlockCache(); + int block_num = bc->GetBlockNumberFromEmuHackOp(op, true); + if (block_num >= 0) { + return bc->GetOriginalFirstOp(block_num); + } else { + return op; + } } + +} // namespace diff --git a/Core/MIPS/ARM64/Arm64Jit.h b/Core/MIPS/ARM64/Arm64Jit.h index 
e1c9cf5dec45..e341df3e7989 100644 --- a/Core/MIPS/ARM64/Arm64Jit.h +++ b/Core/MIPS/ARM64/Arm64Jit.h @@ -54,6 +54,7 @@ class Arm64Jit : public Arm64Gen::ARM64CodeBlock, public JitInterface { const u8 *DoJit(u32 em_address, JitBlock *b); bool DescribeCodePtr(const u8 *ptr, std::string &name) override; + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; void Comp_RunBlock(MIPSOpcode op) override; void Comp_ReplacementFunc(MIPSOpcode op) override; diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 45902a4485c4..66ee7561f2cb 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -1,12 +1,7 @@ +#include "Common/CommonFuncs.h" #include "Core/MIPS/IR/IRInst.h" #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/MIPSDebugInterface.h" -#include "Core/MIPS/MIPSTables.h" -#include "Core/MemMap.h" -#include "Core/HLE/HLE.h" -#include "Core/HLE/ReplaceTables.h" - -#include "math/math_util.h" static const IRMeta irMeta[] = { { IROp::SetConst, "SetConst", "GC" }, @@ -116,406 +111,6 @@ void InitIR() { } } -u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) { - const IRInst *end = inst + count; - while (inst != end) { - switch (inst->op) { - case IROp::SetConst: - mips->r[inst->dest] = constPool[inst->src1]; - break; - case IROp::SetConstF: - memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); - break; - case IROp::SetConstV: - memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); - break; - case IROp::Add: - mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; - break; - case IROp::Sub: - mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2]; - break; - case IROp::And: - mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2]; - break; - case IROp::Or: - mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2]; - break; - case IROp::Xor: - mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2]; - break; - case IROp::Mov: - mips->r[inst->dest] = mips->r[inst->src1]; - break; - case IROp::AddConst: - mips->r[inst->dest] = mips->r[inst->src1] + constPool[inst->src2]; - break; - case IROp::SubConst: - mips->r[inst->dest] = mips->r[inst->src1] - constPool[inst->src2]; - break; - case IROp::AndConst: - mips->r[inst->dest] = mips->r[inst->src1] & constPool[inst->src2]; - break; - case IROp::OrConst: - mips->r[inst->dest] = mips->r[inst->src1] | constPool[inst->src2]; - break; - case IROp::XorConst: - mips->r[inst->dest] = mips->r[inst->src1] ^ constPool[inst->src2]; - break; - case IROp::Neg: - mips->r[inst->dest] = -(s32)mips->r[inst->src1]; - break; - case IROp::Not: - mips->r[inst->dest] = ~mips->r[inst->src1]; - break; - case IROp::Ext8to32: - mips->r[inst->dest] = (s32)(s8)mips->r[inst->src1]; - break; - case IROp::Ext16to32: - mips->r[inst->dest] = (s32)(s16)mips->r[inst->src1]; - break; - - case IROp::Load8: - mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Load8Ext: - mips->r[inst->dest] = (s32)(s8)Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Load16: - mips->r[inst->dest] = Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Load16Ext: - mips->r[inst->dest] = (s32)(s16)Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Load32: - mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::LoadFloat: - mips->f[inst->dest] = 
Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::LoadFloatV: - mips->v[voffset[inst->dest]] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); - break; - - case IROp::Store8: - Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Store16: - Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::Store32: - Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::StoreFloat: - Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); - break; - case IROp::StoreFloatV: - Memory::WriteUnchecked_Float(mips->v[voffset[inst->src3]], mips->r[inst->src1] + constPool[inst->src2]); - break; - - case IROp::ShlImm: - mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; - break; - case IROp::ShrImm: - mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2; - break; - case IROp::SarImm: - mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2; - break; - case IROp::RorImm: - { - u32 x = mips->r[inst->src1]; - int sa = inst->src2; - mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); - } - break; - - case IROp::Shl: - mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31); - break; - case IROp::Shr: - mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31); - break; - case IROp::Sar: - mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31); - break; - case IROp::Ror: - { - u32 x = mips->r[inst->src1]; - int sa = mips->r[inst->src2] & 31; - mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); - } - break; - - case IROp::Clz: - { - int x = 31; - int count = 0; - int value = mips->r[inst->src1]; - while (x >= 0 && !(value & (1 << x))) { - count++; - x--; - } - mips->r[inst->dest] = count; - break; - } - - case IROp::Slt: - mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2]; - break; - - case IROp::SltU: - mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2]; - break; - - case IROp::SltConst: - mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)constPool[inst->src2]; - break; - - case IROp::SltUConst: - mips->r[inst->dest] = mips->r[inst->src1] < constPool[inst->src2]; - break; - - case IROp::MovZ: - if (mips->r[inst->src1] == 0) - mips->r[inst->dest] = mips->r[inst->src2]; - break; - case IROp::MovNZ: - if (mips->r[inst->src1] != 0) - mips->r[inst->dest] = mips->r[inst->src2]; - break; - - case IROp::Max: - mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2]; - break; - case IROp::Min: - mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? 
mips->r[inst->src1] : mips->r[inst->src2]; - break; - - case IROp::MtLo: - mips->lo = mips->r[inst->src1]; - break; - case IROp::MtHi: - mips->hi = mips->r[inst->src1]; - break; - case IROp::MfLo: - mips->r[inst->dest] = mips->lo; - break; - case IROp::MfHi: - mips->r[inst->dest] = mips->hi; - break; - - case IROp::Mult: - { - s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); - break; - } - case IROp::MultU: - { - u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); - break; - } - - case IROp::BSwap16: - { - u32 x = mips->r[inst->src1]; - mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); - break; - } - case IROp::BSwap32: - { - u32 x = mips->r[inst->src1]; - mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24); - break; - } - - case IROp::FAdd: - mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2]; - break; - case IROp::FSub: - mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2]; - break; - case IROp::FMul: - mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2]; - break; - case IROp::FDiv: - mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2]; - break; - - case IROp::FMov: - mips->f[inst->dest] = mips->f[inst->src1]; - break; - case IROp::FAbs: - mips->f[inst->dest] = fabsf(mips->f[inst->src1]); - break; - case IROp::FSqrt: - mips->f[inst->dest] = sqrtf(mips->f[inst->src1]); - break; - case IROp::FNeg: - mips->f[inst->dest] = -mips->f[inst->src1]; - break; - case IROp::FpCondToReg: - mips->r[inst->dest] = mips->fpcond; - break; - case IROp::VfpuCtrlToReg: - mips->r[inst->dest] = mips->vfpuCtrl[inst->src1]; - break; - case IROp::FRound: - mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); - break; - case IROp::FTrunc: - { - float src = mips->f[inst->src1]; - if (src >= 0.0f) { - mips->fs[inst->dest] = (int)floorf(src); - // Overflow, but it was positive. - if (mips->fs[inst->dest] == -2147483648LL) { - mips->fs[inst->dest] = 2147483647LL; - } - } else { - // Overflow happens to be the right value anyway. - mips->fs[inst->dest] = (int)ceilf(src); - } - break; - } - case IROp::FCeil: - mips->fs[inst->dest] = (int)ceilf(mips->f[inst->src1]); - break; - case IROp::FFloor: - mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1]); - break; - - case IROp::FCvtSW: - mips->f[inst->dest] = (float)mips->fs[inst->src1]; - break; - case IROp::FCvtWS: - { - float src = mips->f[inst->src1]; - if (my_isnanorinf(src)) - { - mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? 
-2147483648LL : 2147483647LL; - break; - } - switch (mips->fcr31 & 3) - { - case 0: mips->fs[inst->dest] = (int)round_ieee_754(src); break; // RINT_0 - case 1: mips->fs[inst->dest] = (int)src; break; // CAST_1 - case 2: mips->fs[inst->dest] = (int)ceilf(src); break; // CEIL_2 - case 3: mips->fs[inst->dest] = (int)floorf(src); break; // FLOOR_3 - } - break; //cvt.w.s - } - - case IROp::ZeroFpCond: - mips->fpcond = 0; - break; - - case IROp::FMovFromGPR: - memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4); - break; - case IROp::FMovToGPR: - memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); - break; - - case IROp::VMovFromGPR: - memcpy(&mips->v[voffset[inst->dest]], &mips->r[inst->src1], 4); - break; - case IROp::VMovToGPR: - memcpy(&mips->r[inst->dest], &mips->v[voffset[inst->src1]], 4); - break; - - case IROp::ExitToConst: - return constPool[inst->dest]; - - case IROp::ExitToReg: - return mips->r[inst->dest]; - - case IROp::ExitToConstIfEq: - if (mips->r[inst->src1] == mips->r[inst->src2]) - return constPool[inst->dest]; - break; - case IROp::ExitToConstIfNeq: - if (mips->r[inst->src1] != mips->r[inst->src2]) - return constPool[inst->dest]; - break; - case IROp::ExitToConstIfGtZ: - if ((s32)mips->r[inst->src1] > 0) - return constPool[inst->dest]; - break; - case IROp::ExitToConstIfGeZ: - if ((s32)mips->r[inst->src1] >= 0) - return constPool[inst->dest]; - break; - case IROp::ExitToConstIfLtZ: - if ((s32)mips->r[inst->src1] < 0) - return constPool[inst->dest]; - break; - case IROp::ExitToConstIfLeZ: - if ((s32)mips->r[inst->src1] <= 0) - return constPool[inst->dest]; - break; - - case IROp::Downcount: - mips->downcount -= (inst->src1) | ((inst->src2) << 8); - break; - - case IROp::SetPC: - mips->pc = mips->r[inst->src1]; - break; - - case IROp::SetPCConst: - mips->pc = constPool[inst->src1]; - break; - - case IROp::Syscall: - // SetPC was executed before. - { - MIPSOpcode op(constPool[inst->src1]); - CallSyscall(op); - return mips->pc; - } - - case IROp::Interpret: // SLOW fallback. Can be made faster. - { - MIPSOpcode op(constPool[inst->src1]); - MIPSInterpret(op); - break; - } - - case IROp::CallReplacement: - { - int funcIndex = constPool[inst->src1]; - const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); - int cycles = f->replaceFunc(); - mips->downcount -= cycles; - return mips->r[MIPS_REG_RA]; - } - - case IROp::Break: - Crash(); - break; - - case IROp::SetCtrlVFPU: - mips->vfpuCtrl[inst->dest] = constPool[inst->src1]; - break; - - default: - Crash(); - } -#ifdef _DEBUG - if (mips->r[0] != 0) - Crash(); -#endif - inst++; - } - - // If we got here, the block was badly constructed. 
- Crash(); - return 0; -} - void IRWriter::Write(IROp op, u8 dst, u8 src1, u8 src2) { IRInst inst; inst.op = op; diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp new file mode 100644 index 000000000000..0c3c66188c9d --- /dev/null +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -0,0 +1,411 @@ +#include "Core/MemMap.h" +#include "Core/HLE/HLE.h" +#include "Core/HLE/ReplaceTables.h" +#include "Core/MIPS/MIPSTables.h" + +#include "math/math_util.h" +#include "Common/CommonTypes.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/IR/IRInst.h" +#include "Core/MIPS/IR/IRInterpreter.h" + +u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) { + const IRInst *end = inst + count; + while (inst != end) { + switch (inst->op) { + case IROp::SetConst: + mips->r[inst->dest] = constPool[inst->src1]; + break; + case IROp::SetConstF: + memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); + break; + case IROp::SetConstV: + memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); + break; + case IROp::Add: + mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; + break; + case IROp::Sub: + mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2]; + break; + case IROp::And: + mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2]; + break; + case IROp::Or: + mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2]; + break; + case IROp::Xor: + mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2]; + break; + case IROp::Mov: + mips->r[inst->dest] = mips->r[inst->src1]; + break; + case IROp::AddConst: + mips->r[inst->dest] = mips->r[inst->src1] + constPool[inst->src2]; + break; + case IROp::SubConst: + mips->r[inst->dest] = mips->r[inst->src1] - constPool[inst->src2]; + break; + case IROp::AndConst: + mips->r[inst->dest] = mips->r[inst->src1] & constPool[inst->src2]; + break; + case IROp::OrConst: + mips->r[inst->dest] = mips->r[inst->src1] | constPool[inst->src2]; + break; + case IROp::XorConst: + mips->r[inst->dest] = mips->r[inst->src1] ^ constPool[inst->src2]; + break; + case IROp::Neg: + mips->r[inst->dest] = -(s32)mips->r[inst->src1]; + break; + case IROp::Not: + mips->r[inst->dest] = ~mips->r[inst->src1]; + break; + case IROp::Ext8to32: + mips->r[inst->dest] = (s32)(s8)mips->r[inst->src1]; + break; + case IROp::Ext16to32: + mips->r[inst->dest] = (s32)(s16)mips->r[inst->src1]; + break; + + case IROp::Load8: + mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load8Ext: + mips->r[inst->dest] = (s32)(s8)Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16: + mips->r[inst->dest] = Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16Ext: + mips->r[inst->dest] = (s32)(s16)Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load32: + mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::LoadFloat: + mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::LoadFloatV: + mips->v[voffset[inst->dest]] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::Store8: + Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store16: + 
Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store32: + Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::StoreFloat: + Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::StoreFloatV: + Memory::WriteUnchecked_Float(mips->v[voffset[inst->src3]], mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::ShlImm: + mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; + break; + case IROp::ShrImm: + mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2; + break; + case IROp::SarImm: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2; + break; + case IROp::RorImm: + { + u32 x = mips->r[inst->src1]; + int sa = inst->src2; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::Shl: + mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31); + break; + case IROp::Shr: + mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Sar: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Ror: + { + u32 x = mips->r[inst->src1]; + int sa = mips->r[inst->src2] & 31; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::Clz: + { + int x = 31; + int count = 0; + int value = mips->r[inst->src1]; + while (x >= 0 && !(value & (1 << x))) { + count++; + x--; + } + mips->r[inst->dest] = count; + break; + } + + case IROp::Slt: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2]; + break; + + case IROp::SltU: + mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2]; + break; + + case IROp::SltConst: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)constPool[inst->src2]; + break; + + case IROp::SltUConst: + mips->r[inst->dest] = mips->r[inst->src1] < constPool[inst->src2]; + break; + + case IROp::MovZ: + if (mips->r[inst->src1] == 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + case IROp::MovNZ: + if (mips->r[inst->src1] != 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + + case IROp::Max: + mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2]; + break; + case IROp::Min: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? 
mips->r[inst->src1] : mips->r[inst->src2]; + break; + + case IROp::MtLo: + mips->lo = mips->r[inst->src1]; + break; + case IROp::MtHi: + mips->hi = mips->r[inst->src1]; + break; + case IROp::MfLo: + mips->r[inst->dest] = mips->lo; + break; + case IROp::MfHi: + mips->r[inst->dest] = mips->hi; + break; + + case IROp::Mult: + { + s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + case IROp::MultU: + { + u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + + case IROp::BSwap16: + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); + break; + } + case IROp::BSwap32: + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24); + break; + } + + case IROp::FAdd: + mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2]; + break; + case IROp::FSub: + mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2]; + break; + case IROp::FMul: + mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2]; + break; + case IROp::FDiv: + mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2]; + break; + + case IROp::FMov: + mips->f[inst->dest] = mips->f[inst->src1]; + break; + case IROp::FAbs: + mips->f[inst->dest] = fabsf(mips->f[inst->src1]); + break; + case IROp::FSqrt: + mips->f[inst->dest] = sqrtf(mips->f[inst->src1]); + break; + case IROp::FNeg: + mips->f[inst->dest] = -mips->f[inst->src1]; + break; + case IROp::FpCondToReg: + mips->r[inst->dest] = mips->fpcond; + break; + case IROp::VfpuCtrlToReg: + mips->r[inst->dest] = mips->vfpuCtrl[inst->src1]; + break; + case IROp::FRound: + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); + break; + case IROp::FTrunc: + { + float src = mips->f[inst->src1]; + if (src >= 0.0f) { + mips->fs[inst->dest] = (int)floorf(src); + // Overflow, but it was positive. + if (mips->fs[inst->dest] == -2147483648LL) { + mips->fs[inst->dest] = 2147483647LL; + } + } else { + // Overflow happens to be the right value anyway. + mips->fs[inst->dest] = (int)ceilf(src); + } + break; + } + case IROp::FCeil: + mips->fs[inst->dest] = (int)ceilf(mips->f[inst->src1]); + break; + case IROp::FFloor: + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1]); + break; + + case IROp::FCvtSW: + mips->f[inst->dest] = (float)mips->fs[inst->src1]; + break; + case IROp::FCvtWS: + { + float src = mips->f[inst->src1]; + if (my_isnanorinf(src)) + { + mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? 
-2147483648LL : 2147483647LL; + break; + } + switch (mips->fcr31 & 3) + { + case 0: mips->fs[inst->dest] = (int)round_ieee_754(src); break; // RINT_0 + case 1: mips->fs[inst->dest] = (int)src; break; // CAST_1 + case 2: mips->fs[inst->dest] = (int)ceilf(src); break; // CEIL_2 + case 3: mips->fs[inst->dest] = (int)floorf(src); break; // FLOOR_3 + } + break; //cvt.w.s + } + + case IROp::ZeroFpCond: + mips->fpcond = 0; + break; + + case IROp::FMovFromGPR: + memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4); + break; + case IROp::FMovToGPR: + memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); + break; + + case IROp::VMovFromGPR: + memcpy(&mips->v[voffset[inst->dest]], &mips->r[inst->src1], 4); + break; + case IROp::VMovToGPR: + memcpy(&mips->r[inst->dest], &mips->v[voffset[inst->src1]], 4); + break; + + case IROp::ExitToConst: + return constPool[inst->dest]; + + case IROp::ExitToReg: + return mips->r[inst->dest]; + + case IROp::ExitToConstIfEq: + if (mips->r[inst->src1] == mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfNeq: + if (mips->r[inst->src1] != mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGtZ: + if ((s32)mips->r[inst->src1] > 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGeZ: + if ((s32)mips->r[inst->src1] >= 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLtZ: + if ((s32)mips->r[inst->src1] < 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLeZ: + if ((s32)mips->r[inst->src1] <= 0) + return constPool[inst->dest]; + break; + + case IROp::Downcount: + mips->downcount -= (inst->src1) | ((inst->src2) << 8); + break; + + case IROp::SetPC: + mips->pc = mips->r[inst->src1]; + break; + + case IROp::SetPCConst: + mips->pc = constPool[inst->src1]; + break; + + case IROp::Syscall: + // SetPC was executed before. + { + MIPSOpcode op(constPool[inst->src1]); + CallSyscall(op); + return mips->pc; + } + + case IROp::Interpret: // SLOW fallback. Can be made faster. + { + MIPSOpcode op(constPool[inst->src1]); + MIPSInterpret(op); + break; + } + + case IROp::CallReplacement: + { + int funcIndex = constPool[inst->src1]; + const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); + int cycles = f->replaceFunc(); + mips->downcount -= cycles; + return mips->r[MIPS_REG_RA]; + } + + case IROp::Break: + Crash(); + break; + + case IROp::SetCtrlVFPU: + mips->vfpuCtrl[inst->dest] = constPool[inst->src1]; + break; + + default: + Crash(); + } +#ifdef _DEBUG + if (mips->r[0] != 0) + Crash(); +#endif + inst++; + } + + // If we got here, the block was badly constructed. 
+ Crash(); + return 0; +} diff --git a/Core/MIPS/IR/IRInterpreter.h b/Core/MIPS/IR/IRInterpreter.h new file mode 100644 index 000000000000..fe9f8e4ba9a5 --- /dev/null +++ b/Core/MIPS/IR/IRInterpreter.h @@ -0,0 +1,8 @@ +#pragma once + +#include "Common/CommonTypes.h" + +class MIPSState; +struct IRInst; + +u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index b9522c6b88fd..668da61a362f 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -37,6 +37,7 @@ #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRJit.h" #include "Core/MIPS/IR/IRPassSimplify.h" +#include "Core/MIPS/IR/IRInterpreter.h" #include "Core/MIPS/JitCommon/JitCommon.h" namespace MIPSComp { diff --git a/UI/DevScreens.cpp b/UI/DevScreens.cpp index 1b41243b61f7..42712bc111d1 100644 --- a/UI/DevScreens.cpp +++ b/UI/DevScreens.cpp @@ -669,6 +669,8 @@ UI::EventReturn JitCompareScreen::OnAddressChange(UI::EventParams &e) { return UI::EVENT_DONE; } JitBlockCache *blockCache = MIPSComp::jit->GetBlockCache(); + if (!blockCache) + return UI::EVENT_DONE; u32 addr; if (blockAddr_->GetText().size() > 8) return UI::EVENT_DONE; @@ -731,6 +733,9 @@ UI::EventReturn JitCompareScreen::OnBlockAddress(UI::EventParams &e) { } JitBlockCache *blockCache = MIPSComp::jit->GetBlockCache(); + if (!blockCache) + return UI::EVENT_DONE; + if (Memory::IsValidAddress(e.a)) { currentBlock_ = blockCache->GetBlockNumberFromStartAddress(e.a); } else { @@ -746,6 +751,9 @@ UI::EventReturn JitCompareScreen::OnRandomBlock(UI::EventParams &e) { } JitBlockCache *blockCache = MIPSComp::jit->GetBlockCache(); + if (!blockCache) + return UI::EVENT_DONE; + int numBlocks = blockCache->GetNumBlocks(); if (numBlocks > 0) { currentBlock_ = rand() % numBlocks; @@ -769,6 +777,9 @@ void JitCompareScreen::OnRandomBlock(int flag) { return; } JitBlockCache *blockCache = MIPSComp::jit->GetBlockCache(); + if (!blockCache) + return; + int numBlocks = blockCache->GetNumBlocks(); if (numBlocks > 0) { bool anyWanted = false; @@ -797,6 +808,8 @@ UI::EventReturn JitCompareScreen::OnCurrentBlock(UI::EventParams &e) { return UI::EVENT_DONE; } JitBlockCache *blockCache = MIPSComp::jit->GetBlockCache(); + if (!blockCache) + return UI::EVENT_DONE; std::vector blockNum; blockCache->GetBlockNumbersFromAddress(currentMIPS->pc, &blockNum); if (blockNum.size() > 0) { diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index 1ade2a4303f3..32c4a1f93be3 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -1060,7 +1060,7 @@ void DeveloperToolsScreen::CreateViews() { } #endif - static const char *cpuCores[] = { "Interpreter", "Dynarec (JIT)", "IRJit" }; + static const char *cpuCores[] = { "Interpreter", "Dynarec (JIT)", "IR Interpreter" }; PopupMultiChoice *core = list->Add(new PopupMultiChoice(&g_Config.iCpuCore, gr->T("CPU Core"), cpuCores, 0, ARRAY_SIZE(cpuCores), sy->GetName(), screenManager())); if (!canUseJit) { core->HideChoice(1); diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 1cda4ed0951a..92a10e800c63 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -157,6 +157,16 @@ EXEC_AND_LIB_FILES := \ $(SRC)/Core/MIPS/MIPSVFPUUtils.cpp.arm \ $(SRC)/Core/MIPS/MIPSCodeUtils.cpp.arm \ $(SRC)/Core/MIPS/MIPSDebugInterface.cpp \ + $(SRC)/Core/MIPS/IR/IRJit.cpp \ + $(SRC)/Core/MIPS/IR/IRCompALU.cpp \ + $(SRC)/Core/MIPS/IR/IRCompBranch.cpp \ + $(SRC)/Core/MIPS/IR/IRCompFPU.cpp \ + $(SRC)/Core/MIPS/IR/IRCompLoadStore.cpp 
\ + $(SRC)/Core/MIPS/IR/IRCompVFPU.cpp \ + $(SRC)/Core/MIPS/IR/IRInst.cpp \ + $(SRC)/Core/MIPS/IR/IRInterpreter.cpp \ + $(SRC)/Core/MIPS/IR/IRPassSimplify.cpp \ + $(SRC)/Core/MIPS/IR/IRRegCache.cpp \ $(SRC)/UI/ui_atlas.cpp \ $(SRC)/UI/OnScreenDisplay.cpp \ $(SRC)/ext/libkirk/AES.c \ From 4e52f613f115682f42f57d827bc55ce187083205 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 22:23:51 +0200 Subject: [PATCH 20/77] Additional fixes and buildfixes --- Core/MIPS/IR/IRJit.h | 2 ++ Core/MIPS/IR/IRPassSimplify.cpp | 4 ++++ Core/MemMapFunctions.cpp | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index e26f1c24391c..65da0f8e26a1 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "Common/CPUDetect.h" #include "Core/MIPS/JitCommon/JitState.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 5eb1ea800107..9110ae55f739 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -60,6 +60,10 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; bool symmetric = true; + if (out.GetConstants().size() > 128) { + // Avoid causing a constant explosion. + goto doDefault; + } switch (inst.op) { case IROp::SetConst: gpr.SetImm(inst.dest, constants[inst.src1]); diff --git a/Core/MemMapFunctions.cpp b/Core/MemMapFunctions.cpp index 112ae7093c7f..7ffc80dc9a14 100644 --- a/Core/MemMapFunctions.cpp +++ b/Core/MemMapFunctions.cpp @@ -123,7 +123,7 @@ inline void WriteToHardware(u32 address, const T data) { *(T*)GetPointerUnchecked(address) = data; } else { // In jit, we only flush PC when bIgnoreBadMemAccess is off. 
- if (g_Config.iCpuCore != CPU_CORE_INTERPRETER && g_Config.bIgnoreBadMemAccess) { + if (g_Config.iCpuCore == CPU_CORE_JIT && g_Config.bIgnoreBadMemAccess) { WARN_LOG(MEMMAP, "WriteToHardware: Invalid address %08x", address); } else { WARN_LOG(MEMMAP, "WriteToHardware: Invalid address %08x PC %08x LR %08x", address, currentMIPS->pc, currentMIPS->r[MIPS_REG_RA]); From ed0a0378d788006287091f51178aa7e4a1f285ba Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 22:27:20 +0200 Subject: [PATCH 21/77] Another buildfix --- Common/CommonFuncs.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/Common/CommonFuncs.h b/Common/CommonFuncs.h index c26b696afbce..92e15bfc6f73 100644 --- a/Common/CommonFuncs.h +++ b/Common/CommonFuncs.h @@ -20,30 +20,27 @@ #include "base/compat.h" #include "CommonTypes.h" -#if defined(IOS) || defined(MIPS) -#include -#endif - template struct CompileTimeAssert; template<> struct CompileTimeAssert {}; -#ifndef _WIN32 +#if !defined(_WIN32) #include #include #if defined(_M_IX86) || defined(_M_X86) - #define Crash() {asm ("int $3");} +#include +#define Crash() {asm ("int $3");} #else - #define Crash() {kill(getpid(), SIGINT);} +#define Crash() {kill(getpid(), SIGINT);} #endif #define ARRAYSIZE(A) (sizeof(A)/sizeof((A)[0])) inline u32 __rotl(u32 x, int shift) { - shift &= 31; - if (!shift) return x; - return (x << shift) | (x >> (32 - shift)); + shift &= 31; + if (!shift) return x; + return (x << shift) | (x >> (32 - shift)); } inline u64 __rotl64(u64 x, unsigned int shift){ From 52517ab609b8d3c940d532406cfe131278417243 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 22:32:08 +0200 Subject: [PATCH 22/77] Fix the fix --- Common/CommonFuncs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/CommonFuncs.h b/Common/CommonFuncs.h index 92e15bfc6f73..a533e2df6ad7 100644 --- a/Common/CommonFuncs.h +++ b/Common/CommonFuncs.h @@ -29,9 +29,9 @@ template<> struct CompileTimeAssert {}; #include #if defined(_M_IX86) || defined(_M_X86) -#include #define Crash() {asm ("int $3");} #else +#include #define Crash() {kill(getpid(), SIGINT);} #endif From 5dbac165f434ac0f1699c752e5768ea47655c207 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 14:06:42 -0700 Subject: [PATCH 23/77] Qt, gcc, and Symbian buildfixes. 
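The gcc/Symbian side of this mostly avoids C++11-only std::vector members (data(), emplace_back()) that those toolchains reject, falling back to &v[0] and push_back(). A stand-alone illustration of the substitution, assuming a pre-C++11 compiler (the helper is hypothetical, not part of this change):

#include <cstring>
#include <stdint.h>
#include <vector>

// &v[0] stands in for v.data(); it is only valid for a non-empty vector, hence the guard.
static void CopyPool(uint32_t *dst, const std::vector<uint32_t> &src) {
	if (!src.empty())
		memcpy(dst, &src[0], src.size() * sizeof(uint32_t));
}
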
--- Core/MIPS/IR/IRJit.h | 8 ++++---- Core/MIPS/IR/IRRegCache.cpp | 1 + Qt/mainwindow.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 65da0f8e26a1..e9881f6a6e0b 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -57,10 +57,10 @@ class IRBlock { void SetInstructions(const std::vector &inst, const std::vector &constants) { instr_ = new IRInst[inst.size()]; numInstructions_ = (u16)inst.size(); - memcpy(instr_, inst.data(), sizeof(IRInst) * inst.size()); + memcpy(instr_, &inst[0], sizeof(IRInst) * inst.size()); const_ = new u32[constants.size()]; numConstants_ = (u16)constants.size(); - memcpy(const_, constants.data(), sizeof(u32) * constants.size()); + memcpy(const_, &constants[0], sizeof(u32) * constants.size()); } const IRInst *GetInstructions() const { return instr_; } @@ -85,13 +85,13 @@ class IRBlockCache { void InvalidateICache(u32 addess, u32 length); int GetNumBlocks() const { return (int)blocks_.size(); } int AllocateBlock(int emAddr) { - blocks_.emplace_back(IRBlock(emAddr)); + blocks_.push_back(IRBlock(emAddr)); size_ = (int)blocks_.size(); return (int)blocks_.size() - 1; } IRBlock *GetBlock(int i) { if (i >= 0 && i < size_) { - return blocks_.data() + i; + return &blocks_[i]; } else { return nullptr; } diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index 09aeeb9c9026..aa0aab21acf6 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -1,3 +1,4 @@ +#include #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRInst.h" diff --git a/Qt/mainwindow.h b/Qt/mainwindow.h index bf8bdf2a30f9..ae201054aa1f 100644 --- a/Qt/mainwindow.h +++ b/Qt/mainwindow.h @@ -87,7 +87,7 @@ private slots: // Options // Core - void dynarecAct() { g_Config.bJit = !g_Config.bJit; } + void dynarecAct() { g_Config.iCpuCore = g_Config.iCpuCore == CPU_CORE_INTERPRETER ? CPU_CORE_JIT : CPU_CORE_INTERPRETER; } void vertexDynarecAct() { g_Config.bVertexDecoderJit = !g_Config.bVertexDecoderJit; } void fastmemAct() { g_Config.bFastMemory = !g_Config.bFastMemory; } void ignoreIllegalAct() { g_Config.bIgnoreBadMemAccess = !g_Config.bIgnoreBadMemAccess; } From d4480d50fdad77788e7c0fc717a7c312b79ff6b2 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 22:54:07 +0200 Subject: [PATCH 24/77] jit-ir: Less instructions cause flushing in constant propagation. --- Core/MIPS/IR/IRPassSimplify.cpp | 36 ++++++++++++++++++++++++++++++++- Core/MIPS/IR/IRRegCache.cpp | 6 ++++++ Core/MIPS/IR/IRRegCache.h | 1 + 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 9110ae55f739..efa8c6cdcaa5 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -62,7 +62,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { bool symmetric = true; if (out.GetConstants().size() > 128) { // Avoid causing a constant explosion. 
- goto doDefault; + goto doDefaultAndFlush; } switch (inst.op) { case IROp::SetConst: @@ -134,6 +134,39 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::Mult: + case IROp::MultU: + gpr.MapInIn(inst.src1, inst.src2); + goto doDefault; + + case IROp::MovZ: + case IROp::MovNZ: + gpr.MapInInIn(inst.dest, inst.src1, inst.src2); + goto doDefault; + + case IROp::Min: + case IROp::Max: + gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); + goto doDefault; + + case IROp::Clz: + case IROp::BSwap16: + case IROp::BSwap32: + case IROp::Ext16to32: + case IROp::Ext8to32: + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + + case IROp::MfHi: + case IROp::MfLo: + gpr.MapDirty(inst.dest); + goto doDefault; + + case IROp::MtHi: + case IROp::MtLo: + gpr.MapIn(inst.src1); + goto doDefault; + case IROp::Store8: case IROp::Store16: case IROp::Store32: @@ -175,6 +208,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::ExitToConstIfLtZ: default: { + doDefaultAndFlush: gpr.FlushAll(); doDefault: // Remap constants to the new reality diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp index aa0aab21acf6..63ae135878a1 100644 --- a/Core/MIPS/IR/IRRegCache.cpp +++ b/Core/MIPS/IR/IRRegCache.cpp @@ -45,6 +45,12 @@ void IRRegCache::MapInIn(int rs, int rt) { Flush(rt); } +void IRRegCache::MapInInIn(int rd, int rs, int rt) { + Flush(rd); + Flush(rs); + Flush(rt); +} + void IRRegCache::MapDirtyIn(int rd, int rs) { if (rs != rd) { Discard(rd); diff --git a/Core/MIPS/IR/IRRegCache.h b/Core/MIPS/IR/IRRegCache.h index 68570f50acf5..9fcdab8b1e85 100644 --- a/Core/MIPS/IR/IRRegCache.h +++ b/Core/MIPS/IR/IRRegCache.h @@ -35,6 +35,7 @@ class IRRegCache { void MapDirty(int rd); void MapIn(int rd); void MapInIn(int rs, int rt); + void MapInInIn(int rd, int rs, int rt); void MapDirtyIn(int rd, int rs); void MapDirtyInIn(int rd, int rs, int rt); From c7e4658b6d9fa0fc1ea1f6116767c7f091f35686 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sun, 8 May 2016 23:25:47 +0200 Subject: [PATCH 25/77] More constant propagation --- Core/MIPS/IR/IRPassSimplify.cpp | 38 ++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index efa8c6cdcaa5..bceff578e258 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -157,6 +157,15 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; + case IROp::FMovFromGPR: + if (gpr.IsImm(inst.src1)) { + out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); + } else { + gpr.MapIn(inst.src1); + goto doDefault; + } + break; + case IROp::MfHi: case IROp::MfLo: gpr.MapDirty(inst.dest); @@ -174,11 +183,18 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { gpr.MapIn(inst.dest); out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); } else { - // Just pass through, no excessive flushing gpr.MapInIn(inst.dest, inst.src1); goto doDefault; } break; + case IROp::StoreFloat: + if (gpr.IsImm(inst.src1)) { + out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); + } else { + gpr.MapIn(inst.src1); + goto doDefault; + } + break; case IROp::Load8: case IROp::Load8Ext: @@ -193,6 +209,26 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { goto doDefault; } break; + case IROp::LoadFloat: + if (gpr.IsImm(inst.src1)) { + 
out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); + } else { + gpr.MapIn(inst.src1); + goto doDefault; + } + break; + + // FP-only instructions don't need to flush immediates. + case IROp::FAdd: + case IROp::FMul: + case IROp::FDiv: + case IROp::FSub: + case IROp::FNeg: + case IROp::FAbs: + case IROp::FSqrt: + case IROp::FMov: + out.Write(inst); + break; case IROp::Syscall: case IROp::Interpret: From d19174b52b82b56f9f35265b1c5c7f319dbd9a28 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 14:56:43 -0700 Subject: [PATCH 26/77] jit-ir: Skip const flush on downcount op. This allows discarding more unused constants. --- Core/MIPS/IR/IRPassSimplify.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index bceff578e258..886ea35a174f 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -218,6 +218,10 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::Downcount: + out.Write(inst); + break; + // FP-only instructions don't need to flush immediates. case IROp::FAdd: case IROp::FMul: From 0d7f15116761922d60dab15e6f4524bac9f9d23b Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 16:08:35 -0700 Subject: [PATCH 27/77] jit-ir: Add a utility func for applying passes. --- Core/MIPS/IR/IRInst.h | 11 +++++++++++ Core/MIPS/IR/IRJit.cpp | 9 ++++----- Core/MIPS/IR/IRPassSimplify.cpp | 25 +++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 2 ++ 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index a25996590607..def7185d7a2e 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -243,6 +243,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c // Each IR block gets a constant pool. class IRWriter { public: + IRWriter &operator =(const IRWriter &w) { + insts_ = w.insts_; + constPool_ = w.constPool_; + return *this; + } + IRWriter &operator =(IRWriter &&w) { + insts_ = std::move(w.insts_); + constPool_ = std::move(w.constPool_); + return *this; + } + void Write(IROp op, u8 dst = 0, u8 src1 = 0, u8 src2 = 0); void Write(IRInst inst) { insts_.push_back(inst); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 668da61a362f..a7223bcc317a 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -248,15 +248,14 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { ir.Simplify(); IRWriter simplified; - IRWriter *code = &ir; if (true) { - if (PropagateConstants(ir, simplified)) + static const IRPassFunc passes[] = { + &PropagateConstants, + }; + if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) logBlocks = 1; code = &simplified; - // Some blocks in tekken generate curious numbers of constants after propagation. 
- //if (ir.GetConstants().size() >= 64) - // logBlocks = 1; } b->SetInstructions(code->GetInstructions(), code->GetConstants()); diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 886ea35a174f..7f2e765b58c7 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -51,6 +51,31 @@ IROp ArithToArithConst(IROp op) { } } +bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out) { + if (c == 1) { + return passes[0](in, out); + } + + bool logBlocks = false; + + IRWriter temp[2]; + const IRWriter *nextIn = ∈ + IRWriter *nextOut = &temp[1]; + for (size_t i = 0; i < c - 1; ++i) { + if (passes[i](*nextIn, *nextOut)) { + logBlocks = true; + } + + temp[0] = std::move(temp[1]); + nextIn = &temp[0]; + } + + if (passes[c - 1](*nextIn, out)) { + logBlocks = true; + } + + return logBlocks; +} bool PropagateConstants(const IRWriter &in, IRWriter &out) { IRRegCache gpr(&out); diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index 5a57be1cfae4..efba1749eb97 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -5,5 +5,7 @@ // Dumb example of a simplification pass that can't add or remove instructions. void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); +typedef bool (*IRPassFunc)(const IRWriter &in, IRWriter &out); +bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out); bool PropagateConstants(const IRWriter &in, IRWriter &out); \ No newline at end of file From d09f3a22a8d288aa1feab6b8b037cead76b1c7ca Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 16:19:48 -0700 Subject: [PATCH 28/77] jit-ir: Propagate single-op constants. --- Core/MIPS/IR/IRPassSimplify.cpp | 45 +++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 7f2e765b58c7..f6002fb46ab9 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -37,6 +37,28 @@ u32 Evaluate(u32 a, u32 b, IROp op) { } } +u32 Evaluate(u32 a, IROp op) { + switch (op) { + case IROp::Not: return ~a; + case IROp::Neg: return -(s32)a; + case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8); + case IROp::BSwap32: return swap32(a); + case IROp::Ext8to32: return (u32)(s32)(s8)(u8)a; + case IROp::Ext16to32: return (u32)(s32)(s16)(u16)a; + case IROp::Clz: { + int x = 31; + int count = 0; + while (x >= 0 && !(a & (1 << x))) { + count++; + x--; + } + return count; + } + default: + return -1; + } +} + IROp ArithToArithConst(IROp op) { switch (op) { case IROp::Add: return IROp::AddConst; @@ -121,6 +143,21 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::Neg: + case IROp::Not: + case IROp::BSwap16: + case IROp::BSwap32: + case IROp::Ext8to32: + case IROp::Ext16to32: + case IROp::Clz: + if (gpr.IsImm(inst.src1)) { + gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op)); + } else { + gpr.MapDirtyIn(inst.dest, inst.src1); + goto doDefault; + } + break; + case IROp::AddConst: case IROp::SubConst: case IROp::AndConst: @@ -174,14 +211,6 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); goto doDefault; - case IROp::Clz: - case IROp::BSwap16: - case IROp::BSwap32: - case IROp::Ext16to32: - case IROp::Ext8to32: - gpr.MapDirtyIn(inst.dest, inst.src1); - goto doDefault; - case IROp::FMovFromGPR: if 
(gpr.IsImm(inst.src1)) { out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); From a22ff68e9ee0735811dfe95b34eafe65e53108fc Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 18:48:05 -0700 Subject: [PATCH 29/77] jit-ir: Skip flushing when updating PC. --- Core/MIPS/IR/IRPassSimplify.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index f6002fb46ab9..3ea09d11c52e 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -273,7 +273,16 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::Downcount: - out.Write(inst); + case IROp::SetPCConst: + goto doDefault; + + case IROp::SetPC: + if (gpr.IsImm(inst.src1)) { + out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1))); + } else { + gpr.MapIn(inst.src1); + goto doDefault; + } break; // FP-only instructions don't need to flush immediates. From a1b4b5170c0295fe454e5d757278978358e5ba7c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 18:59:32 -0700 Subject: [PATCH 30/77] jit-ir: Propagate constants even for overlaps. --- Core/MIPS/IR/IRPassSimplify.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 3ea09d11c52e..02aea8fff8d2 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -126,17 +126,26 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::Xor: if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); - } else if (gpr.IsImm(inst.src2) && inst.src1 != inst.src2 && inst.dest != inst.src2) { + } else if (gpr.IsImm(inst.src2)) { + const u32 imm2 = gpr.GetImm(inst.src2); gpr.MapDirtyIn(inst.dest, inst.src1); - if (gpr.GetImm(inst.src2) == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) { + if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) { + // Add / Or with zero is just a Mov. if (inst.dest != inst.src1) out.Write(IROp::Mov, inst.dest, inst.src1); } else { - out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(gpr.GetImm(inst.src2))); + out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2)); } - } else if (symmetric && gpr.IsImm(inst.src1) && inst.src1 != inst.src2 && inst.dest != inst.src2) { + } else if (symmetric && gpr.IsImm(inst.src1)) { + const u32 imm1 = gpr.GetImm(inst.src1); gpr.MapDirtyIn(inst.dest, inst.src2); - out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(gpr.GetImm(inst.src1))); + if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or)) { + // Add / Or with zero is just a Mov. + if (inst.dest != inst.src2) + out.Write(IROp::Mov, inst.dest, inst.src2); + } else { + out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1)); + } } else { gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); goto doDefault; From 5221a02db4188f6c27cae682beaada8a04dad84b Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 19:11:58 -0700 Subject: [PATCH 31/77] jit-ir: Propagate constants for shifts. This might optimize away an IRTEMP_0 in such cases. 
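The rewrite, lightly condensed from the hunk below (the fallback path that maps both inputs and defers to the generic case is omitted): when the shift-amount register holds a known value, the variable shift becomes its immediate form, and a shift by zero collapses into a plain Mov.

	case IROp::Shl:
	case IROp::Shr:
	case IROp::Ror:
	case IROp::Sar:
		if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) {
			// Both operands known: fold the shift away entirely.
			gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op));
		} else if (gpr.IsImm(inst.src2)) {
			// Only the amount is known: switch to the *Imm form of the same op.
			const u8 sa = gpr.GetImm(inst.src2) & 31;  // MIPS uses the low 5 bits
			gpr.MapDirtyIn(inst.dest, inst.src1);
			if (sa == 0) {
				if (inst.dest != inst.src1)
					out.Write(IROp::Mov, inst.dest, inst.src1);
			} else {
				out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa);
			}
		}
		break;
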
--- Core/MIPS/IR/IRCompALU.cpp | 6 +++--- Core/MIPS/IR/IRPassSimplify.cpp | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 82053dc63fa3..4fbec417ba71 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -182,13 +182,13 @@ void IRJit::Comp_RType3(MIPSOpcode op) { } } -void IRJit::CompShiftImm(MIPSOpcode op, IROp shiftOpConst, int sa) { +void IRJit::CompShiftImm(MIPSOpcode op, IROp shiftOpImm, int sa) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; - ir.Write(shiftOpConst, rd, rt, sa); + ir.Write(shiftOpImm, rd, rt, sa); } -void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpConst) { +void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpImm) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 02aea8fff8d2..6a6df0e5b17a 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -73,6 +73,17 @@ IROp ArithToArithConst(IROp op) { } } +IROp ShiftToShiftImm(IROp op) { + switch (op) { + case IROp::Shl: return IROp::ShlImm; + case IROp::Shr: return IROp::ShrImm; + case IROp::Ror: return IROp::RorImm; + case IROp::Sar: return IROp::SarImm; + default: + return (IROp)-1; + } +} + bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out) { if (c == 1) { return passes[0](in, out); @@ -182,6 +193,27 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::Shl: + case IROp::Shr: + case IROp::Ror: + case IROp::Sar: + if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { + gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); + } else if (gpr.IsImm(inst.src2)) { + const u8 sa = gpr.GetImm(inst.src2) & 31; + gpr.MapDirtyIn(inst.dest, inst.src1); + if (sa == 0) { + if (inst.dest != inst.src1) + out.Write(IROp::Mov, inst.dest, inst.src1); + } else { + out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa); + } + } else { + gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); + goto doDefault; + } + break; + case IROp::ShlImm: case IROp::ShrImm: case IROp::RorImm: From 6bd31ecb272409d5742df7c5dff4dfb87a56dad3 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 21:25:34 -0700 Subject: [PATCH 32/77] jit-ir: Flush consts better for a few f/v ops. 
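What makes these cases work is that FPU/VFPU side state is addressed in the same register-index space the constant tracker already understands (IRREG_FPCOND, plus the IRREG_VPFU_CTRL_BASE block added below), so it takes part in propagation just like a GPR. Excerpt from the hunk below, with an explanatory comment added:

	case IROp::FpCondToReg:
		if (gpr.IsImm(IRREG_FPCOND)) {
			// The condition bit is tracked like any other register; a known
			// value simply turns the destination into a constant as well.
			gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND));
		} else {
			gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
			out.Write(inst);
		}
		break;
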
--- Core/MIPS/IR/IRInst.h | 2 ++ Core/MIPS/IR/IRPassSimplify.cpp | 47 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index def7185d7a2e..a3739898b893 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -213,6 +213,8 @@ enum { IRTEMP_RHS, // Reserved for use in branches // Hacky way to get to other state + IRREG_VPFU_CTRL_BASE = 208, + IRREG_VPFU_CC = 211, IRREG_LO = 226, // offset of lo in MIPSState / 4 IRREG_HI = 227, IRREG_FCR31 = 228, diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 6a6df0e5b17a..a6a939cd469d 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -261,6 +261,23 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::VMovFromGPR: + if (gpr.IsImm(inst.src1)) { + out.Write(IROp::SetConstV, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); + } else { + gpr.MapIn(inst.src1); + goto doDefault; + } + break; + + case IROp::FMovToGPR: + gpr.MapDirty(inst.dest); + goto doDefault; + + case IROp::VMovToGPR: + gpr.MapDirty(inst.dest); + goto doDefault; + case IROp::MfHi: case IROp::MfLo: gpr.MapDirty(inst.dest); @@ -283,6 +300,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; case IROp::StoreFloat: + case IROp::StoreFloatV: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); } else { @@ -305,6 +323,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; case IROp::LoadFloat: + case IROp::LoadFloatV: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); } else { @@ -335,9 +354,37 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::FAbs: case IROp::FSqrt: case IROp::FMov: + case IROp::FRound: + case IROp::FTrunc: + case IROp::FCeil: + case IROp::FFloor: + case IROp::FCvtSW: + out.Write(inst); + break; + + case IROp::SetCtrlVFPU: + goto doDefault; + + case IROp::FCvtWS: + // TODO: Actually, this should just use the currently set rounding mode. + // Move up with FCvtSW when that's implemented. + gpr.MapIn(IRREG_FCR31); out.Write(inst); break; + case IROp::FpCondToReg: + if (gpr.IsImm(IRREG_FPCOND)) { + gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND)); + } else { + gpr.MapDirtyIn(inst.dest, IRREG_FPCOND); + out.Write(inst); + } + break; + + case IROp::VfpuCtrlToReg: + gpr.MapDirtyIn(inst.dest, IRREG_VPFU_CTRL_BASE + inst.src1); + goto doDefault; + case IROp::Syscall: case IROp::Interpret: case IROp::ExitToConst: From 7ce923d01f17926bcec8cc1b9f5217ad832f4397 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 22:11:53 -0700 Subject: [PATCH 33/77] jit-ir: Fix SetConstV. Oops. 
--- Core/MIPS/IR/IRInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 0c3c66188c9d..9b0c9eb1eb6b 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -21,7 +21,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); break; case IROp::SetConstV: - memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); + memcpy(&mips->v[voffset[inst->dest]], &constPool[inst->src1], 4); break; case IROp::Add: mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; From f638477b9a4007ce7d8dcdecf44a4e8db56f4f48 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 May 2016 22:17:39 -0700 Subject: [PATCH 34/77] jit-ir: Add the rest to PropagateConstants. Just for completeness. --- Core/MIPS/IR/IRPassSimplify.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index a6a939cd469d..3285ff194d89 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -126,6 +126,9 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::SetConst: gpr.SetImm(inst.dest, constants[inst.src1]); break; + case IROp::SetConstF: + case IROp::SetConstV: + goto doDefault; case IROp::Sub: case IROp::Slt: @@ -381,6 +384,22 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::ZeroFpCond: + case IROp::FCmpUnordered: + case IROp::FCmpEqual: + case IROp::FCmpEqualUnordered: + case IROp::FCmpLessOrdered: + case IROp::FCmpLessUnordered: + case IROp::FCmpLessEqualOrdered: + case IROp::FCmpLessEqualUnordered: + gpr.MapDirty(IRREG_FPCOND); + goto doDefault; + + case IROp::RestoreRoundingMode: + case IROp::ApplyRoundingMode: + case IROp::UpdateRoundingMode: + goto doDefault; + case IROp::VfpuCtrlToReg: gpr.MapDirtyIn(inst.dest, IRREG_VPFU_CTRL_BASE + inst.src1); goto doDefault; From f6d245f3c4a5ce32fc978652ac05578deff3858e Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 9 May 2016 00:13:01 -0700 Subject: [PATCH 35/77] jit-ir: Remove redundant simplify pass. This is just doing the same thing as the const folding pass, really. 
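The removed pass only strength-reduced the degenerate AddConst forms: adding zero, and adding an immediate to the zero register. Constant folding already produces both rewrites whenever it knows an operand, so the in-place pass just duplicated work. Roughly, with illustrative stand-in types:

    #include <cstdint>

    enum class SketchOp : uint8_t { AddConst, Mov, SetConst };
    struct SketchInst { SketchOp op; uint8_t dest, src1; uint32_t imm; };

    // Both rewrites below also fall out of PropagateConstants on its own.
    SketchInst FoldAddConst(uint8_t dest, uint8_t src1, uint32_t imm) {
        if (imm == 0)
            return {SketchOp::Mov, dest, src1, 0};      // x + 0 == x
        if (src1 == 0)                                  // MIPS r0 is hardwired to zero
            return {SketchOp::SetConst, dest, 0, imm};  // 0 + imm == imm
        return {SketchOp::AddConst, dest, src1, imm};   // nothing to simplify
    }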
--- Core/MIPS/IR/IRInst.cpp | 4 ---- Core/MIPS/IR/IRInst.h | 2 -- Core/MIPS/IR/IRJit.cpp | 2 -- Core/MIPS/IR/IRPassSimplify.cpp | 19 +------------------ Core/MIPS/IR/IRPassSimplify.h | 3 --- 5 files changed, 1 insertion(+), 29 deletions(-) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 66ee7561f2cb..6b3231ce6d9c 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -143,10 +143,6 @@ int IRWriter::AddConstantFloat(float value) { return AddConstant(val); } -void IRWriter::Simplify() { - SimplifyInPlace(&insts_[0], (int)insts_.size(), constPool_.data()); -} - const char *GetGPRName(int r) { if (r < 32) { return currentDebugMIPS->GetRegName(0, r); diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index a3739898b893..77d71ed91534 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -270,8 +270,6 @@ class IRWriter { constPool_.clear(); } - void Simplify(); - const std::vector &GetInstructions() const { return insts_; } const std::vector &GetConstants() const { return constPool_; } diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index a7223bcc317a..76b58161d3e2 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -245,8 +245,6 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { } } - ir.Simplify(); - IRWriter simplified; IRWriter *code = &ir; if (true) { diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 3285ff194d89..b296a5b3a343 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -1,24 +1,7 @@ +#include "Common/Log.h" #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/IR/IRRegCache.h" -void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool) { - for (int i = 0; i < count; i++) { - switch (inst[i].op) { - case IROp::AddConst: - if (constPool[inst[i].src2] == 0) - inst[i].op = IROp::Mov; - else if (inst[i].src1 == 0) { - inst[i].op = IROp::SetConst; - inst[i].src1 = inst[i].src2; - } - break; - default: - break; - } - } -} - - u32 Evaluate(u32 a, u32 b, IROp op) { switch (op) { case IROp::Add: case IROp::AddConst: return a + b; diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index efba1749eb97..5bf3f53fb9eb 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -2,9 +2,6 @@ #include "Core/MIPS/IR/IRInst.h" -// Dumb example of a simplification pass that can't add or remove instructions. -void SimplifyInPlace(IRInst *inst, int count, const u32 *constPool); - typedef bool (*IRPassFunc)(const IRWriter &in, IRWriter &out); bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out); From ccb8f8d77e5fef95b2cde188b1f303d40ac466af Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 9 May 2016 00:36:38 -0700 Subject: [PATCH 36/77] jit-ir: Fix replacement hooks. 
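With this fix, a full (non-hook) replacement ends its block from the IR side: the frontend writes out the pending downcount and exits through RA, instead of the interpreter returning RA directly. A rough sketch of the emitted tail, using a stand-in writer type rather than the real IRWriter:

    #include <cstdint>
    #include <vector>

    enum class SketchOp : uint8_t { Downcount, ExitToReg };

    // Stand-in for IRWriter::Write(IROp, u8, u8, u8).
    struct SketchWriter {
        struct Inst { SketchOp op; uint8_t a, b, c; };
        std::vector<Inst> insts;
        void Write(SketchOp op, uint8_t a = 0, uint8_t b = 0, uint8_t c = 0) {
            insts.push_back({op, a, b, c});
        }
    };

    // Charge the cycles accumulated while compiling, then exit through RA.
    // The 16-bit cycle count is split across the two 8-bit operand slots.
    void EndBlockAfterReplacement(SketchWriter &ir, int downcountAmount, uint8_t raReg) {
        ir.Write(SketchOp::Downcount, 0, downcountAmount & 0xFF, downcountAmount >> 8);
        ir.Write(SketchOp::ExitToReg, raReg, 0, 0);
    }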
--- Core/MIPS/IR/IRInterpreter.cpp | 2 +- Core/MIPS/IR/IRJit.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 9b0c9eb1eb6b..03b06c77e129 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -384,7 +384,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); int cycles = f->replaceFunc(); mips->downcount -= cycles; - return mips->r[MIPS_REG_RA]; + break; } case IROp::Break: diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 76b58161d3e2..e5bddd6c8984 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -340,6 +340,8 @@ void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); } else { ApplyRoundingMode(); + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + ir.Write(IROp::ExitToReg, MIPS_REG_RA, 0, 0); js.compiling = false; } } else { From eb6551d72a71288477abdef92a7aebf7ba5dba59 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 9 May 2016 01:10:04 -0700 Subject: [PATCH 37/77] jit-ir: Correct downcount handling. Oops, was wrong - already accounted for delay slots. Clear so we don't double count when emitting a syscall. Fixes FF4 utility msg flickering. --- Core/MIPS/IR/IRCompBranch.cpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index e2d6c99c8523..acfdfaffe2e1 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -66,8 +66,9 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs); - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; MIPSGPReg lhs = rs; MIPSGPReg rhs = rt; @@ -109,8 +110,9 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; MIPSGPReg lhs = rs; if (!delaySlotIsNice) { // if likely, we don't need this @@ -184,8 +186,9 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { if (!likely) CompileDelaySlot(); - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; FlushAll(); // Not taken @@ -223,8 +226,9 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { logBlocks = 1; ir.Write(IROp::VfpuCtrlToReg, IRTEMP_LHS, VFPU_CTRL_CC); - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; // Sometimes there's a VFPU branch in a delay slot (Disgaea 2: Dark Hero Days, Zettai Hero Project, La Pucelle) // The behavior is undefined - the CPU may take the second branch even if the first one passes. 
@@ -271,8 +275,9 @@ void IRJit::Comp_Jump(MIPSOpcode op) { u32 off = _IMM26 << 2; u32 targetAddr = (GetCompilerPC() & 0xF0000000) | off; - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; // Might be a stubbed address or something? if (!Memory::IsValidAddress(targetAddr)) { @@ -320,8 +325,9 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { if (andLink && rs == rd) delaySlotIsNice = false; - int dcAmount = js.downcountAmount + 1; + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; int destReg; if (IsSyscall(delaySlotOp)) { @@ -363,18 +369,18 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { } void IRJit::Comp_Syscall(MIPSOpcode op) { - // If we're in a delay slot, this is off by one. - const int offset = js.inDelaySlot ? -1 : 0; RestoreRoundingMode(); - js.downcountAmount = -offset; - int dcAmount = js.downcountAmount + 1; + // Note: If we're in a delay slot, this is off by one compared to the interpreter. + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); + js.downcountAmount = 0; FlushAll(); ir.Write(IROp::Syscall, 0, ir.AddConstant(op.encoding)); + // TODO: This never happens because of Syscall exiting. ApplyRoundingMode(); js.compiling = false; } From f50617d67924b32c6193648cc74551d9bb93db84 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 00:37:08 +0200 Subject: [PATCH 38/77] Skip const flush on set float constant --- Core/MIPS/IR/IRPassSimplify.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index b296a5b3a343..929eeed9759f 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -427,4 +427,4 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } } return logBlocks; -} \ No newline at end of file +} From 6e44e97ffa48ffa8fbe0e4896e6696fb0851fb7d Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 19:25:51 +0200 Subject: [PATCH 39/77] Refactor prep: Split JitInterface into MIPSFrontendInterface and JitInterface --- Core/HLE/ReplaceTables.cpp | 2 +- Core/MIPS/ARM/ArmJit.h | 2 +- Core/MIPS/ARM64/Arm64Jit.h | 2 +- Core/MIPS/IR/IRJit.cpp | 4 ++- Core/MIPS/IR/IRJit.h | 2 +- Core/MIPS/IR/IRPassSimplify.cpp | 4 +++ Core/MIPS/JitCommon/JitCommon.h | 46 +++++++++++++++++++-------------- Core/MIPS/MIPS/MipsJit.h | 3 ++- Core/MIPS/MIPSTables.cpp | 5 ++-- Core/MIPS/MIPSTables.h | 4 +-- Core/MIPS/x86/Jit.h | 2 +- 11 files changed, 44 insertions(+), 32 deletions(-) diff --git a/Core/HLE/ReplaceTables.cpp b/Core/HLE/ReplaceTables.cpp index 06b5392d9d51..47a8b2d2ad6e 100644 --- a/Core/HLE/ReplaceTables.cpp +++ b/Core/HLE/ReplaceTables.cpp @@ -1109,7 +1109,7 @@ static int Hook_omertachinmokunookitethelegacy_download_frame() { return 0; } -#define JITFUNC(f) (&MIPSComp::JitInterface::f) +#define JITFUNC(f) (&MIPSComp::MIPSFrontendInterface::f) // Can either replace with C functions or functions emitted in Asm/ArmAsm. 
static const ReplacementTableEntry entries[] = { diff --git a/Core/MIPS/ARM/ArmJit.h b/Core/MIPS/ARM/ArmJit.h index efdde624bf36..7ec62e04c87a 100644 --- a/Core/MIPS/ARM/ArmJit.h +++ b/Core/MIPS/ARM/ArmJit.h @@ -33,7 +33,7 @@ namespace MIPSComp { -class ArmJit : public ArmGen::ARMXCodeBlock, public JitInterface { +class ArmJit : public ArmGen::ARMXCodeBlock, public JitInterface, public MIPSFrontendInterface { public: ArmJit(MIPSState *mips); virtual ~ArmJit(); diff --git a/Core/MIPS/ARM64/Arm64Jit.h b/Core/MIPS/ARM64/Arm64Jit.h index e341df3e7989..4aec5feccacb 100644 --- a/Core/MIPS/ARM64/Arm64Jit.h +++ b/Core/MIPS/ARM64/Arm64Jit.h @@ -33,7 +33,7 @@ namespace MIPSComp { -class Arm64Jit : public Arm64Gen::ARM64CodeBlock, public JitInterface { +class Arm64Jit : public Arm64Gen::ARM64CodeBlock, public JitInterface, public MIPSFrontendInterface { public: Arm64Jit(MIPSState *mips); virtual ~Arm64Jit(); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index e5bddd6c8984..ee5f5c8206b9 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -254,6 +254,8 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) logBlocks = 1; code = &simplified; + if (ir.GetInstructions().size() >= 24) + logBlocks = 1; } b->SetInstructions(code->GetInstructions(), code->GetConstants()); @@ -408,4 +410,4 @@ MIPSOpcode IRJit::GetOriginalOp(MIPSOpcode op) { return b->GetOriginalFirstOp(); } -} // namespace MIPSComp \ No newline at end of file +} // namespace MIPSComp diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index e9881f6a6e0b..76a27d1a4bef 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -101,7 +101,7 @@ class IRBlockCache { std::vector blocks_; }; -class IRJit : public JitInterface { +class IRJit : public JitInterface, public MIPSFrontendInterface{ public: IRJit(MIPSState *mips); virtual ~IRJit(); diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 929eeed9759f..0e5353ff5717 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -121,6 +121,10 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::And: case IROp::Or: case IROp::Xor: + // Regularize, for the add/or check below. 
+ if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) { + std::swap(inst.src1, inst.src2); + } if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); } else if (gpr.IsImm(inst.src2)) { diff --git a/Core/MIPS/JitCommon/JitCommon.h b/Core/MIPS/JitCommon/JitCommon.h index e27707ea3558..62a91a30c209 100644 --- a/Core/MIPS/JitCommon/JitCommon.h +++ b/Core/MIPS/JitCommon/JitCommon.h @@ -42,27 +42,11 @@ class MIPSState; namespace MIPSComp { void JitAt(); - class JitInterface { + class MIPSFrontendInterface { public: - virtual ~JitInterface() {} + virtual ~MIPSFrontendInterface() {} - virtual bool DescribeCodePtr(const u8 *ptr, std::string &name) = 0; - virtual const u8 *GetDispatcher() const = 0; - virtual JitBlockCache *GetBlockCache() = 0; - virtual void InvalidateCache() = 0; - virtual void InvalidateCacheAt(u32 em_address, int length = 4) = 0; - virtual void DoState(PointerWrap &p) = 0; - virtual void DoDummyState(PointerWrap &p) = 0; - virtual void RunLoopUntil(u64 globalticks) = 0; - virtual void Compile(u32 em_address) = 0; - virtual void ClearCache() = 0; virtual void EatPrefix() = 0; - virtual MIPSOpcode GetOriginalOp(MIPSOpcode op) = 0; - - // Block linking. This may need to work differently for whole-function JITs and stuff - // like that. - virtual void LinkBlock(u8 *exitPoint, const u8 *entryPoint) = 0; - virtual void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) = 0; virtual void Comp_Generic(MIPSOpcode op) = 0; virtual void Comp_RunBlock(MIPSOpcode op) = 0; @@ -132,8 +116,30 @@ namespace MIPSComp { virtual int Replace_fabsf() = 0; }; - typedef void (JitInterface::*MIPSCompileFunc)(MIPSOpcode opcode); - typedef int (JitInterface::*MIPSReplaceFunc)(); + class JitInterface { + public: + virtual ~JitInterface() {} + + virtual bool DescribeCodePtr(const u8 *ptr, std::string &name) = 0; + virtual const u8 *GetDispatcher() const = 0; + virtual JitBlockCache *GetBlockCache() = 0; + virtual void InvalidateCache() = 0; + virtual void InvalidateCacheAt(u32 em_address, int length = 4) = 0; + virtual void DoState(PointerWrap &p) = 0; + virtual void DoDummyState(PointerWrap &p) = 0; + virtual void RunLoopUntil(u64 globalticks) = 0; + virtual void Compile(u32 em_address) = 0; + virtual void ClearCache() = 0; + virtual MIPSOpcode GetOriginalOp(MIPSOpcode op) = 0; + + // Block linking. This may need to work differently for whole-function JITs and stuff + // like that. 
+ virtual void LinkBlock(u8 *exitPoint, const u8 *entryPoint) = 0; + virtual void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) = 0; + }; + + typedef void (MIPSFrontendInterface::*MIPSCompileFunc)(MIPSOpcode opcode); + typedef int (MIPSFrontendInterface::*MIPSReplaceFunc)(); extern JitInterface *jit; diff --git a/Core/MIPS/MIPS/MipsJit.h b/Core/MIPS/MIPS/MipsJit.h index 1fcb6faea6fd..8c5dbf9bd380 100644 --- a/Core/MIPS/MIPS/MipsJit.h +++ b/Core/MIPS/MIPS/MipsJit.h @@ -20,6 +20,7 @@ #include "Common/MipsEmitter.h" using namespace MIPSGen; +#include "Core/MIPS/JitCommon/JitCommon.h" #include "Core/MIPS/JitCommon/JitState.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "../MIPSVFPUUtils.h" @@ -31,7 +32,7 @@ using namespace MIPSGen; namespace MIPSComp { -class MipsJit : public MIPSGen::MIPSCodeBlock, public JitInterface +class MipsJit : public MIPSGen::MIPSCodeBlock, public JitInterface, public MIPSFrontendInterface { public: MipsJit(MIPSState *mips); diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp index f0b51db0c0df..76e260b377fb 100644 --- a/Core/MIPS/MIPSTables.cpp +++ b/Core/MIPS/MIPSTables.cpp @@ -82,7 +82,7 @@ struct MIPSInstruction { #define ENCODING(a) {a} #define INSTR(name, comp, dis, inter, flags) {Instruc, name, comp, dis, inter, MIPSInfo(flags)} -#define JITFUNC(f) (&JitInterface::f) +#define JITFUNC(f) (&MIPSFrontendInterface::f) using namespace MIPSDis; using namespace MIPSInt; @@ -912,7 +912,7 @@ const MIPSInstruction *MIPSGetInstruction(MIPSOpcode op) { return instr; } -void MIPSCompileOp(MIPSOpcode op, MIPSComp::JitInterface *jit) { +void MIPSCompileOp(MIPSOpcode op, MIPSComp::MIPSFrontendInterface *jit) { if (op == 0) return; const MIPSInstruction *instr = MIPSGetInstruction(op); @@ -923,7 +923,6 @@ void MIPSCompileOp(MIPSOpcode op, MIPSComp::JitInterface *jit) { } else { ERROR_LOG_REPORT(CPU,"MIPSCompileOp %08x failed",op.encoding); } - if (info & OUT_EAT_PREFIX) jit->EatPrefix(); } else { diff --git a/Core/MIPS/MIPSTables.h b/Core/MIPS/MIPSTables.h index 39c8c06753d5..3987aa3e87f7 100644 --- a/Core/MIPS/MIPSTables.h +++ b/Core/MIPS/MIPSTables.h @@ -112,10 +112,10 @@ typedef void (CDECL *MIPSDisFunc)(MIPSOpcode opcode, char *out); typedef void (CDECL *MIPSInterpretFunc)(MIPSOpcode opcode); namespace MIPSComp { - class JitInterface; + class MIPSFrontendInterface; } -void MIPSCompileOp(MIPSOpcode op, MIPSComp::JitInterface *jit); +void MIPSCompileOp(MIPSOpcode op, MIPSComp::MIPSFrontendInterface *jit); void MIPSDisAsm(MIPSOpcode op, u32 pc, char *out, bool tabsToSpaces = false); MIPSInfo MIPSGetInfo(MIPSOpcode op); void MIPSInterpret(MIPSOpcode op); //only for those rare ones diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index a6f44443311a..4206f4d307a5 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -46,7 +46,7 @@ struct RegCacheState { FPURegCacheState fpr; }; -class Jit : public Gen::XCodeBlock, public JitInterface { +class Jit : public Gen::XCodeBlock, public JitInterface, public MIPSFrontendInterface { public: Jit(MIPSState *mips); virtual ~Jit(); From e806c369b2f3b18ea6c8095ddbfe920fc4427b20 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 19:57:18 +0200 Subject: [PATCH 40/77] Separate the IR frontend from the IR "Jit" --- Core/MIPS/IR/IRCompALU.cpp | 20 +++--- Core/MIPS/IR/IRCompBranch.cpp | 25 ++++--- Core/MIPS/IR/IRCompFPU.cpp | 10 +-- Core/MIPS/IR/IRCompLoadStore.cpp | 6 +- Core/MIPS/IR/IRCompVFPU.cpp | 82 +++++++++++----------- Core/MIPS/IR/IRJit.cpp | 86 +++++++++++++----------- 
Core/MIPS/IR/IRJit.h | 112 +++++++++++++------------------ 7 files changed, 162 insertions(+), 179 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 4fbec417ba71..6500d1b0df11 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -45,7 +45,7 @@ using namespace MIPSAnalyst; namespace MIPSComp { -void IRJit::Comp_IType(MIPSOpcode op) { +void IRFrontend::Comp_IType(MIPSOpcode op) { CONDITIONAL_DISABLE; s32 simm = (s32)(s16)(op & 0xFFFF); // sign extension @@ -87,7 +87,7 @@ void IRJit::Comp_IType(MIPSOpcode op) { } } -void IRJit::Comp_RType2(MIPSOpcode op) { +void IRFrontend::Comp_RType2(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rs = _RS; @@ -110,7 +110,7 @@ void IRJit::Comp_RType2(MIPSOpcode op) { } } -void IRJit::Comp_RType3(MIPSOpcode op) { +void IRFrontend::Comp_RType3(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rt = _RT; @@ -182,13 +182,13 @@ void IRJit::Comp_RType3(MIPSOpcode op) { } } -void IRJit::CompShiftImm(MIPSOpcode op, IROp shiftOpImm, int sa) { +void IRFrontend::CompShiftImm(MIPSOpcode op, IROp shiftOpImm, int sa) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; ir.Write(shiftOpImm, rd, rt, sa); } -void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpImm) { +void IRFrontend::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpImm) { MIPSGPReg rd = _RD; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; @@ -196,7 +196,7 @@ void IRJit::CompShiftVar(MIPSOpcode op, IROp shiftOp, IROp shiftOpImm) { ir.Write(shiftOp, rd, rt, IRTEMP_0); } -void IRJit::Comp_ShiftType(MIPSOpcode op) { +void IRFrontend::Comp_ShiftType(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rs = _RS; MIPSGPReg rd = _RD; @@ -221,7 +221,7 @@ void IRJit::Comp_ShiftType(MIPSOpcode op) { } } -void IRJit::Comp_Special3(MIPSOpcode op) { +void IRFrontend::Comp_Special3(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rs = _RS; MIPSGPReg rt = _RT; @@ -260,7 +260,7 @@ void IRJit::Comp_Special3(MIPSOpcode op) { } -void IRJit::Comp_Allegrex(MIPSOpcode op) { +void IRFrontend::Comp_Allegrex(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rt = _RT; MIPSGPReg rd = _RD; @@ -284,7 +284,7 @@ void IRJit::Comp_Allegrex(MIPSOpcode op) { } } -void IRJit::Comp_Allegrex2(MIPSOpcode op) { +void IRFrontend::Comp_Allegrex2(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rt = _RT; MIPSGPReg rd = _RD; @@ -305,7 +305,7 @@ void IRJit::Comp_Allegrex2(MIPSOpcode op) { } } -void IRJit::Comp_MulDivType(MIPSOpcode op) { +void IRFrontend::Comp_MulDivType(MIPSOpcode op) { CONDITIONAL_DISABLE; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index acfdfaffe2e1..3dda003b562d 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -53,7 +53,7 @@ namespace MIPSComp { using namespace Arm64Gen; -void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { +void IRFrontend::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in RSRTComp delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -98,7 +98,7 @@ void IRJit::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { js.compiling = false; } -void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely) { +void IRFrontend::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in RSZeroComp delay slot at %08x in block starting at %08x", 
GetCompilerPC(), js.blockStart); return; @@ -135,7 +135,7 @@ void IRJit::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool js.compiling = false; } -void IRJit::Comp_RelBranch(MIPSOpcode op) { +void IRFrontend::Comp_RelBranch(MIPSOpcode op) { // The CC flags here should be opposite of the actual branch becuase they skip the branching action. switch (op >> 26) { case 4: BranchRSRTComp(op, IRComparison::NotEqual, false); break;//beq @@ -156,7 +156,7 @@ void IRJit::Comp_RelBranch(MIPSOpcode op) { } } -void IRJit::Comp_RelBranchRI(MIPSOpcode op) { +void IRFrontend::Comp_RelBranchRI(MIPSOpcode op) { switch ((op >> 16) & 0x1F) { case 0: BranchRSZeroComp(op, IRComparison::GreaterEqual, false, false); break; //if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 4; break;//bltz case 1: BranchRSZeroComp(op, IRComparison::Less, false, false); break; //if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 4; break;//bgez @@ -173,7 +173,7 @@ void IRJit::Comp_RelBranchRI(MIPSOpcode op) { } // If likely is set, discard the branch slot if NOT taken. -void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { +void IRFrontend::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in FPFlag delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -201,7 +201,7 @@ void IRJit::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { js.compiling = false; } -void IRJit::Comp_FPUBranch(MIPSOpcode op) { +void IRFrontend::Comp_FPUBranch(MIPSOpcode op) { switch((op >> 16) & 0x1f) { case 0: BranchFPFlag(op, IRComparison::NotEqual, false); break; // bc1f case 1: BranchFPFlag(op, IRComparison::Equal, false); break; // bc1t @@ -214,7 +214,7 @@ void IRJit::Comp_FPUBranch(MIPSOpcode op) { } // If likely is set, discard the branch slot if NOT taken. 
-void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { +void IRFrontend::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in VFPU delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -223,7 +223,6 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { u32 targetAddr = GetCompilerPC() + offset + 4; MIPSOpcode delaySlotOp = GetOffsetInstruction(1); - logBlocks = 1; ir.Write(IROp::VfpuCtrlToReg, IRTEMP_LHS, VFPU_CTRL_CC); int dcAmount = js.downcountAmount; @@ -257,7 +256,7 @@ void IRJit::BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely) { js.compiling = false; } -void IRJit::Comp_VBranch(MIPSOpcode op) { +void IRFrontend::Comp_VBranch(MIPSOpcode op) { switch ((op >> 16) & 3) { case 0: BranchVFPUFlag(op, IRComparison::NotEqual, false); break; // bvf case 1: BranchVFPUFlag(op, IRComparison::Equal, false); break; // bvt @@ -266,7 +265,7 @@ void IRJit::Comp_VBranch(MIPSOpcode op) { } } -void IRJit::Comp_Jump(MIPSOpcode op) { +void IRFrontend::Comp_Jump(MIPSOpcode op) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in Jump delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -311,7 +310,7 @@ void IRJit::Comp_Jump(MIPSOpcode op) { js.compiling = false; } -void IRJit::Comp_JumpReg(MIPSOpcode op) { +void IRFrontend::Comp_JumpReg(MIPSOpcode op) { if (js.inDelaySlot) { ERROR_LOG_REPORT(JIT, "Branch in JumpReg delay slot at %08x in block starting at %08x", GetCompilerPC(), js.blockStart); return; @@ -368,7 +367,7 @@ void IRJit::Comp_JumpReg(MIPSOpcode op) { js.compiling = false; } -void IRJit::Comp_Syscall(MIPSOpcode op) { +void IRFrontend::Comp_Syscall(MIPSOpcode op) { RestoreRoundingMode(); // Note: If we're in a delay slot, this is off by one compared to the interpreter. 
@@ -385,7 +384,7 @@ void IRJit::Comp_Syscall(MIPSOpcode op) { js.compiling = false; } -void IRJit::Comp_Break(MIPSOpcode op) { +void IRFrontend::Comp_Break(MIPSOpcode op) { ir.Write(IROp::Break); js.compiling = false; } diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index b0ff42cf261c..1ca4a08e96ac 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -54,7 +54,7 @@ namespace MIPSComp { -void IRJit::Comp_FPU3op(MIPSOpcode op) { +void IRFrontend::Comp_FPU3op(MIPSOpcode op) { CONDITIONAL_DISABLE; int ft = _FT; @@ -72,7 +72,7 @@ void IRJit::Comp_FPU3op(MIPSOpcode op) { } } -void IRJit::Comp_FPULS(MIPSOpcode op) { +void IRFrontend::Comp_FPULS(MIPSOpcode op) { CONDITIONAL_DISABLE; s32 offset = _IMM16; int ft = _FT; @@ -93,7 +93,7 @@ void IRJit::Comp_FPULS(MIPSOpcode op) { } } -void IRJit::Comp_FPUComp(MIPSOpcode op) { +void IRFrontend::Comp_FPUComp(MIPSOpcode op) { DISABLE; // IROps not yet implemented int opc = op & 0xF; @@ -136,7 +136,7 @@ void IRJit::Comp_FPUComp(MIPSOpcode op) { ir.Write(irOp, fs, ft); } -void IRJit::Comp_FPU2op(MIPSOpcode op) { +void IRFrontend::Comp_FPU2op(MIPSOpcode op) { CONDITIONAL_DISABLE; int fs = _FS; @@ -192,7 +192,7 @@ void IRJit::Comp_FPU2op(MIPSOpcode op) { } } -void IRJit::Comp_mxc1(MIPSOpcode op) { +void IRFrontend::Comp_mxc1(MIPSOpcode op) { CONDITIONAL_DISABLE; int fs = _FS; diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp index 4e702a544f2a..41c76a1a7d83 100644 --- a/Core/MIPS/IR/IRCompLoadStore.cpp +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -65,11 +65,11 @@ #define DISABLE { Comp_Generic(op); return; } namespace MIPSComp { - void IRJit::Comp_ITypeMemLR(MIPSOpcode op, bool load) { + void IRFrontend::Comp_ITypeMemLR(MIPSOpcode op, bool load) { DISABLE; } - void IRJit::Comp_ITypeMem(MIPSOpcode op) { + void IRFrontend::Comp_ITypeMem(MIPSOpcode op) { CONDITIONAL_DISABLE; int offset = (signed short)(op & 0xFFFF); @@ -124,7 +124,7 @@ namespace MIPSComp { } } - void IRJit::Comp_Cache(MIPSOpcode op) { + void IRFrontend::Comp_Cache(MIPSOpcode op) { // int imm = (s16)(op & 0xFFFF); // int rs = _RS; // int addr = R(rs) + imm; diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 1f2623ac67e3..e6f5ca3a8757 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -51,7 +51,7 @@ namespace MIPSComp { - void IRJit::Comp_VPFX(MIPSOpcode op) { + void IRFrontend::Comp_VPFX(MIPSOpcode op) { CONDITIONAL_DISABLE; int data = op & 0xFFFFF; int regnum = (op >> 24) & 3; @@ -74,7 +74,7 @@ namespace MIPSComp { } } - void IRJit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { + void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { if (prefix == 0xE4) return; @@ -128,7 +128,7 @@ namespace MIPSComp { } } - void IRJit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) { + void IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); GetVectorRegs(regs, sz, vectorReg); @@ -143,7 +143,7 @@ namespace MIPSComp { } } - void IRJit::ApplyPrefixD(const u8 *vregs, VectorSize sz) { + void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); if (!js.prefixD) return; @@ -176,11 +176,11 @@ namespace MIPSComp { */ } - void IRJit::Comp_SV(MIPSOpcode op) { + void IRFrontend::Comp_SV(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_SVQ(MIPSOpcode op) { + void IRFrontend::Comp_SVQ(MIPSOpcode op) { int imm = (signed short)(op & 
0xFFFC); int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5); MIPSGPReg rs = _RS; @@ -215,37 +215,37 @@ namespace MIPSComp { } } - void IRJit::Comp_VVectorInit(MIPSOpcode op) { + void IRFrontend::Comp_VVectorInit(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VIdt(MIPSOpcode op) { + void IRFrontend::Comp_VIdt(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VMatrixInit(MIPSOpcode op) { + void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VHdp(MIPSOpcode op) { + void IRFrontend::Comp_VHdp(MIPSOpcode op) { DISABLE; } static const float MEMORY_ALIGNED16(vavg_table[4]) = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f }; - void IRJit::Comp_Vhoriz(MIPSOpcode op) { + void IRFrontend::Comp_Vhoriz(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VDot(MIPSOpcode op) { + void IRFrontend::Comp_VDot(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VecDo3(MIPSOpcode op) { + void IRFrontend::Comp_VecDo3(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VV2Op(MIPSOpcode op) { + void IRFrontend::Comp_VV2Op(MIPSOpcode op) { CONDITIONAL_DISABLE; // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) { @@ -254,19 +254,19 @@ namespace MIPSComp { DISABLE; } - void IRJit::Comp_Vi2f(MIPSOpcode op) { + void IRFrontend::Comp_Vi2f(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vh2f(MIPSOpcode op) { + void IRFrontend::Comp_Vh2f(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vf2i(MIPSOpcode op) { + void IRFrontend::Comp_Vf2i(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Mftv(MIPSOpcode op) { + void IRFrontend::Comp_Mftv(MIPSOpcode op) { int imm = op & 0xFF; MIPSGPReg rt = _RT; switch ((op >> 21) & 0x1f) { @@ -275,7 +275,6 @@ namespace MIPSComp { if (rt != 0) { if (imm < 128) { //R(rt) = VI(imm); ir.Write(IROp::VMovToGPR, rt, imm); - logBlocks = 1; } else { DISABLE; } @@ -285,7 +284,6 @@ namespace MIPSComp { case 7: // mtv if (imm < 128) { ir.Write(IROp::VMovFromGPR, imm, rt); - logBlocks = 1; } else { DISABLE; } @@ -296,93 +294,93 @@ namespace MIPSComp { } } - void IRJit::Comp_Vmfvc(MIPSOpcode op) { + void IRFrontend::Comp_Vmfvc(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vmtvc(MIPSOpcode op) { + void IRFrontend::Comp_Vmtvc(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vmmov(MIPSOpcode op) { + void IRFrontend::Comp_Vmmov(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VScl(MIPSOpcode op) { + void IRFrontend::Comp_VScl(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vmmul(MIPSOpcode op) { + void IRFrontend::Comp_Vmmul(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vmscl(MIPSOpcode op) { + void IRFrontend::Comp_Vmscl(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vtfm(MIPSOpcode op) { + void IRFrontend::Comp_Vtfm(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VCrs(MIPSOpcode op) { + void IRFrontend::Comp_VCrs(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VDet(MIPSOpcode op) { + void IRFrontend::Comp_VDet(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vi2x(MIPSOpcode op) { + void IRFrontend::Comp_Vi2x(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vx2i(MIPSOpcode op) { + void IRFrontend::Comp_Vx2i(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_VCrossQuat(MIPSOpcode op) { + void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vcmp(MIPSOpcode op) { + void IRFrontend::Comp_Vcmp(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vcmov(MIPSOpcode op) { + void IRFrontend::Comp_Vcmov(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Viim(MIPSOpcode op) { + void IRFrontend::Comp_Viim(MIPSOpcode op) { DISABLE; 
} - void IRJit::Comp_Vfim(MIPSOpcode op) { + void IRFrontend::Comp_Vfim(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vcst(MIPSOpcode op) { + void IRFrontend::Comp_Vcst(MIPSOpcode op) { DISABLE; } // Very heavily used by FF:CC. Should be replaced by a fast approximation instead of // calling the math library. - void IRJit::Comp_VRot(MIPSOpcode op) { + void IRFrontend::Comp_VRot(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vsgn(MIPSOpcode op) { + void IRFrontend::Comp_Vsgn(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vocp(MIPSOpcode op) { + void IRFrontend::Comp_Vocp(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_ColorConv(MIPSOpcode op) { + void IRFrontend::Comp_ColorConv(MIPSOpcode op) { DISABLE; } - void IRJit::Comp_Vbfy(MIPSOpcode op) { + void IRFrontend::Comp_Vbfy(MIPSOpcode op) { DISABLE; } } diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index ee5f5c8206b9..9ed4f2eb70de 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -42,11 +42,7 @@ namespace MIPSComp { -IRJit::IRJit(MIPSState *mips) : mips_(mips) { - logBlocks = 0; - dontLogBlocks = 0; - js.startDefaultPrefix = mips_->HasDefaultPrefix(); - js.currentRoundingFunc = convertS0ToSCRATCH1[0]; +IRJit::IRJit(MIPSState *mips) : mips_(mips), frontend_(mips->HasDefaultPrefix()) { u32 size = 128 * 1024; // blTrampolines_ = kernelMemory.Alloc(size, true, "trampoline"); InitIR(); @@ -55,7 +51,14 @@ IRJit::IRJit(MIPSState *mips) : mips_(mips) { IRJit::~IRJit() { } -void IRJit::DoState(PointerWrap &p) { +IRFrontend::IRFrontend(bool startDefaultPrefix) { + logBlocks = 0; + dontLogBlocks = 0; + js.startDefaultPrefix = startDefaultPrefix; + // js.currentRoundingFunc = convertS0ToSCRATCH1[0]; +} + +void IRFrontend::DoState(PointerWrap &p) { auto s = p.Section("Jit", 1, 2); if (!s) return; @@ -67,10 +70,10 @@ void IRJit::DoState(PointerWrap &p) { } else { js.hasSetRounding = 1; } +} - if (p.GetMode() == PointerWrap::MODE_READ) { - js.currentRoundingFunc = convertS0ToSCRATCH1[(mips_->fcr31) & 3]; - } +void IRJit::DoState(PointerWrap &p) { + frontend_.DoState(p); } // This is here so the savestate matches between jit and non-jit. 
@@ -87,11 +90,11 @@ void IRJit::DoDummyState(PointerWrap &p) { } } -void IRJit::FlushAll() { +void IRFrontend::FlushAll() { FlushPrefixV(); } -void IRJit::FlushPrefixV() { +void IRFrontend::FlushPrefixV() { if ((js.prefixSFlag & JitState::PREFIX_DIRTY) != 0) { ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_SPREFIX, ir.AddConstant(js.prefixS)); js.prefixSFlag = (JitState::PrefixState) (js.prefixSFlag & ~JitState::PREFIX_DIRTY); @@ -121,7 +124,7 @@ void IRJit::InvalidateCacheAt(u32 em_address, int length) { blocks_.InvalidateICache(em_address, length); } -void IRJit::EatInstruction(MIPSOpcode op) { +void IRFrontend::EatInstruction(MIPSOpcode op) { MIPSInfo info = MIPSGetInfo(op); if (info & DELAYSLOT) { ERROR_LOG_REPORT_ONCE(ateDelaySlot, JIT, "Ate a branch op."); @@ -135,23 +138,15 @@ void IRJit::EatInstruction(MIPSOpcode op) { js.downcountAmount += MIPSGetInstructionCycleEstimate(op); } -void IRJit::CompileDelaySlot() { +void IRFrontend::CompileDelaySlot() { js.inDelaySlot = true; MIPSOpcode op = GetOffsetInstruction(1); MIPSCompileOp(op, this); js.inDelaySlot = false; } -void IRJit::Compile(u32 em_address) { - PROFILE_THIS_SCOPE("jitc"); - - int block_num = blocks_.AllocateBlock(em_address); - IRBlock *b = blocks_.GetBlock(block_num); - DoJit(em_address, b); - b->Finalize(block_num); // Overwrites the first instruction - +bool IRFrontend::CheckRounding() { bool cleanSlate = false; - if (js.hasSetRounding && !js.lastSetRounding) { WARN_LOG(JIT, "Detected rounding mode usage, rebuilding jit with checks"); // Won't loop, since hasSetRounding is only ever set to 1. @@ -161,16 +156,27 @@ void IRJit::Compile(u32 em_address) { // Drat. The VFPU hit an uneaten prefix at the end of a block. if (js.startDefaultPrefix && js.MayHavePrefix()) { - WARN_LOG(JIT, "An uneaten prefix at end of block: %08x", GetCompilerPC() - 4); + WARN_LOG(JIT, "An uneaten prefix at end of block"); js.LogPrefix(); // Let's try that one more time. We won't get back here because we toggled the value. js.startDefaultPrefix = false; - // TODO ARM64: This crashes. - //cleanSlate = true; + // TODO: Make sure this works. + // cleanSlate = true; } - if (cleanSlate) { + return cleanSlate; +} + +void IRJit::Compile(u32 em_address) { + PROFILE_THIS_SCOPE("jitc"); + + int block_num = blocks_.AllocateBlock(em_address); + IRBlock *b = blocks_.GetBlock(block_num); + frontend_.DoJit(em_address, b); + b->Finalize(block_num); // Overwrites the first instruction + + if (frontend_.CheckRounding()) { // Our assumptions are all wrong so it's clean-slate time. 
ClearCache(); Compile(em_address); @@ -208,18 +214,18 @@ void IRJit::RunLoopUntil(u64 globalticks) { // RestoreRoundingMode(true); } -u32 IRJit::GetCompilerPC() { +u32 IRFrontend::GetCompilerPC() { return js.compilerPC; } -MIPSOpcode IRJit::GetOffsetInstruction(int offset) { +MIPSOpcode IRFrontend::GetOffsetInstruction(int offset) { return Memory::Read_Instruction(GetCompilerPC() + 4 * offset); } -void IRJit::DoJit(u32 em_address, IRBlock *b) { +void IRFrontend::DoJit(u32 em_address, IRBlock *b) { js.cancel = false; - js.blockStart = mips_->pc; - js.compilerPC = mips_->pc; + js.blockStart = em_address; + js.compilerPC = em_address; js.lastContinuedPC = 0; js.initialBlockSize = 0; js.nextExit = 0; @@ -262,7 +268,7 @@ void IRJit::DoJit(u32 em_address, IRBlock *b) { if (logBlocks > 0 && dontLogBlocks == 0) { char temp2[256]; - ILOG("=============== mips %d %08x ===============", blocks_.GetNumBlocks(), em_address); + ILOG("=============== mips %08x ===============", em_address); for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { temp2[0] = 0; MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp2, true); @@ -301,7 +307,7 @@ bool IRJit::DescribeCodePtr(const u8 *ptr, std::string &name) { return false; } -void IRJit::Comp_RunBlock(MIPSOpcode op) { +void IRFrontend::Comp_RunBlock(MIPSOpcode op) { // This shouldn't be necessary, the dispatcher should catch us before we get here. ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); } @@ -319,7 +325,7 @@ bool IRJit::ReplaceJalTo(u32 dest) { return false; } -void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { +void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { int index = op.encoding & MIPS_EMUHACK_VALUE_MASK; const ReplacementTableEntry *entry = GetReplacementFunc(index); @@ -351,7 +357,7 @@ void IRJit::Comp_ReplacementFunc(MIPSOpcode op) { } } -void IRJit::Comp_Generic(MIPSOpcode op) { +void IRFrontend::Comp_Generic(MIPSOpcode op) { FlushAll(); ir.Write(IROp::Interpret, 0, ir.AddConstant(op.encoding)); const MIPSInfo info = MIPSGetInfo(op); @@ -363,7 +369,7 @@ void IRJit::Comp_Generic(MIPSOpcode op) { } // Destroys SCRATCH2 -void IRJit::RestoreRoundingMode(bool force) { +void IRFrontend::RestoreRoundingMode(bool force) { // If the game has never set an interesting rounding mode, we can safely skip this. if (force || js.hasSetRounding) { ir.Write(IROp::RestoreRoundingMode); @@ -371,7 +377,7 @@ void IRJit::RestoreRoundingMode(bool force) { } // Destroys SCRATCH1 and SCRATCH2 -void IRJit::ApplyRoundingMode(bool force) { +void IRFrontend::ApplyRoundingMode(bool force) { // If the game has never set an interesting rounding mode, we can safely skip this. 
if (force || js.hasSetRounding) { ir.Write(IROp::ApplyRoundingMode); @@ -379,14 +385,14 @@ void IRJit::ApplyRoundingMode(bool force) { } // Destroys SCRATCH1 and SCRATCH2 -void IRJit::UpdateRoundingMode() { +void IRFrontend::UpdateRoundingMode() { ir.Write(IROp::UpdateRoundingMode); } -void IRJit::Comp_DoNothing(MIPSOpcode op) { +void IRFrontend::Comp_DoNothing(MIPSOpcode op) { } -int IRJit::Replace_fabsf() { +int IRFrontend::Replace_fabsf() { Crash(); return 0; } diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 76a27d1a4bef..3947136ea93f 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -101,28 +101,11 @@ class IRBlockCache { std::vector blocks_; }; -class IRJit : public JitInterface, public MIPSFrontendInterface{ +class IRFrontend : public MIPSFrontendInterface { public: - IRJit(MIPSState *mips); - virtual ~IRJit(); - - void DoState(PointerWrap &p) override; - void DoDummyState(PointerWrap &p) override; - - const JitOptions &GetJitOptions() { return jo; } - - // Compiled ops should ignore delay slots - // the compiler will take care of them by itself - // OR NOT + IRFrontend(bool startDefaultPrefix); void Comp_Generic(MIPSOpcode op) override; - void RunLoopUntil(u64 globalticks) override; - - void Compile(u32 em_address) override; // Compiles a block at current MIPS PC - void DoJit(u32 em_address, IRBlock *b); - - bool DescribeCodePtr(const u8 *ptr, std::string &name) override; - void Comp_RunBlock(MIPSOpcode op) override; void Comp_ReplacementFunc(MIPSOpcode op) override; @@ -195,25 +178,17 @@ class IRJit : public JitInterface, public MIPSFrontendInterface{ void Comp_Vbfy(MIPSOpcode op) override; int Replace_fabsf(); + void DoState(PointerWrap &p); + bool CheckRounding(); // returns true if we need a do-over + void DoJit(u32 em_address, IRBlock *b); - // Not using a regular block cache. - JitBlockCache *GetBlockCache() override { return nullptr; } - MIPSOpcode GetOriginalOp(MIPSOpcode op) override; - - void ClearCache(); - void InvalidateCache(); - void InvalidateCacheAt(u32 em_address, int length = 4); +private: + void RestoreRoundingMode(bool force = false); + void ApplyRoundingMode(bool force = false); + void UpdateRoundingMode(); void EatPrefix() { js.EatPrefix(); } - const u8 *GetDispatcher() const override { - return dispatcher; - } - - void LinkBlock(u8 *exitPoint, const u8 *checkedEntry) override; - void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) override; - -private: void FlushAll(); void FlushPrefixV(); @@ -222,12 +197,6 @@ class IRJit : public JitInterface, public MIPSFrontendInterface{ void EatInstruction(MIPSOpcode op); MIPSOpcode GetOffsetInstruction(int offset); - void RestoreRoundingMode(bool force = false); - void ApplyRoundingMode(bool force = false); - void UpdateRoundingMode(); - - bool ReplaceJalTo(u32 dest); - // Utility compilation functions void BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely); void BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely); @@ -255,44 +224,55 @@ class IRJit : public JitInterface, public MIPSFrontendInterface{ // Utils void Comp_ITypeMemLR(MIPSOpcode op, bool load); - JitOptions jo; + // State JitState js; - - IRBlockCache blocks_; - - MIPSState *mips_; + IRWriter ir; int dontLogBlocks; int logBlocks; +}; - IRWriter ir; +class IRJit : public JitInterface { +public: + IRJit(MIPSState *mips); + virtual ~IRJit(); - // where to write branch-likely trampolines. 
not used atm - // u32 blTrampolines_; - // int blTrampolineCount_; + void DoState(PointerWrap &p) override; + void DoDummyState(PointerWrap &p) override; -public: - // Code pointers - const u8 *enterDispatcher; + const JitOptions &GetJitOptions() { return jo; } - const u8 *outerLoop; - const u8 *outerLoopPCInSCRATCH1; - const u8 *dispatcherCheckCoreState; - const u8 *dispatcherPCInSCRATCH1; - const u8 *dispatcher; - const u8 *dispatcherNoCheck; + void RunLoopUntil(u64 globalticks) override; - const u8 *breakpointBailout; + void Compile(u32 em_address) override; // Compiles a block at current MIPS PC - const u8 *saveStaticRegisters; - const u8 *loadStaticRegisters; + bool DescribeCodePtr(const u8 *ptr, std::string &name) override; + // Not using a regular block cache. + JitBlockCache *GetBlockCache() override { return nullptr; } + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; + + void ClearCache(); + void InvalidateCache(); + void InvalidateCacheAt(u32 em_address, int length = 4); + + const u8 *GetDispatcher() const override { return nullptr; } + + void LinkBlock(u8 *exitPoint, const u8 *checkedEntry) override; + void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) override; + +private: + bool ReplaceJalTo(u32 dest); + + JitOptions jo; + + IRFrontend frontend_; + IRBlockCache blocks_; - const u8 *restoreRoundingMode; - const u8 *applyRoundingMode; - const u8 *updateRoundingMode; + MIPSState *mips_; - // Indexed by FPCR FZ:RN bits for convenience. Uses SCRATCH2. - const u8 *convertS0ToSCRATCH1[8]; + // where to write branch-likely trampolines. not used atm + // u32 blTrampolines_; + // int blTrampolineCount_; }; } // namespace MIPSComp From e711a47a7526bc2e3324be6e4158e5d2caa8fde7 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 20:05:06 +0200 Subject: [PATCH 41/77] Complete the separation of the IR compiler frontend from the "Jit" --- CMakeLists.txt | 2 + Core/Core.vcxproj | 2 + Core/Core.vcxproj.filters | 6 + Core/MIPS/IR/IRFrontend.cpp | 288 ++++++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRFrontend.h | 144 ++++++++++++++++++ Core/MIPS/IR/IRJit.cpp | 253 ------------------------------- Core/MIPS/IR/IRJit.h | 133 +---------------- android/jni/Android.mk | 1 + 8 files changed, 444 insertions(+), 385 deletions(-) create mode 100644 Core/MIPS/IR/IRFrontend.cpp create mode 100644 Core/MIPS/IR/IRFrontend.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 655d8f7e1306..7ce5de115b88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1084,6 +1084,8 @@ set(CoreExtra ${CoreExtra} Core/MIPS/IR/IRCompFPU.cpp Core/MIPS/IR/IRCompLoadStore.cpp Core/MIPS/IR/IRCompVFPU.cpp + Core/MIPS/IR/IRFrontend.cpp + Core/MIPS/IR/IRFrontend.h Core/MIPS/IR/IRInst.cpp Core/MIPS/IR/IRInst.h Core/MIPS/IR/IRInterpreter.cpp diff --git a/Core/Core.vcxproj b/Core/Core.vcxproj index 561d83d2d36e..60d9c7c66c21 100644 --- a/Core/Core.vcxproj +++ b/Core/Core.vcxproj @@ -187,6 +187,7 @@ + @@ -518,6 +519,7 @@ + diff --git a/Core/Core.vcxproj.filters b/Core/Core.vcxproj.filters index 0fc92ec2fad1..600070d1e8d4 100644 --- a/Core/Core.vcxproj.filters +++ b/Core/Core.vcxproj.filters @@ -670,6 +670,9 @@ MIPS\IR + + MIPS\IR + @@ -1230,6 +1233,9 @@ MIPS\IR + + MIPS\IR + diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp new file mode 100644 index 000000000000..94a7ebe1056d --- /dev/null +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -0,0 +1,288 @@ +// Copyright (c) 2012- PPSSPP Project. 
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "base/logging.h" + +#include "Common/ChunkFile.h" +#include "Core/Reporting.h" +#include "Core/MemMap.h" + +#include "Core/MIPS/MIPSTables.h" +#include "Core/HLE/ReplaceTables.h" + +#include "Core/MIPS/IR/IRFrontend.h" +#include "Core/MIPS/IR/IRRegCache.h" +#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRPassSimplify.h" +#include "Core/MIPS/IR/IRInterpreter.h" + +namespace MIPSComp { + +IRFrontend::IRFrontend(bool startDefaultPrefix) { + logBlocks = 0; + dontLogBlocks = 0; + js.startDefaultPrefix = startDefaultPrefix; + // js.currentRoundingFunc = convertS0ToSCRATCH1[0]; +} + +void IRFrontend::DoState(PointerWrap &p) { + auto s = p.Section("Jit", 1, 2); + if (!s) + return; + + p.Do(js.startDefaultPrefix); + if (s >= 2) { + p.Do(js.hasSetRounding); + js.lastSetRounding = 0; + } else { + js.hasSetRounding = 1; + } +} + +void IRFrontend::FlushAll() { + FlushPrefixV(); +} + +void IRFrontend::FlushPrefixV() { + if ((js.prefixSFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_SPREFIX, ir.AddConstant(js.prefixS)); + js.prefixSFlag = (JitState::PrefixState) (js.prefixSFlag & ~JitState::PREFIX_DIRTY); + } + + if ((js.prefixTFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_TPREFIX, ir.AddConstant(js.prefixT)); + js.prefixTFlag = (JitState::PrefixState) (js.prefixTFlag & ~JitState::PREFIX_DIRTY); + } + + if ((js.prefixDFlag & JitState::PREFIX_DIRTY) != 0) { + ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_DPREFIX, ir.AddConstant(js.prefixD)); + js.prefixDFlag = (JitState::PrefixState) (js.prefixDFlag & ~JitState::PREFIX_DIRTY); + } +} + +void IRFrontend::EatInstruction(MIPSOpcode op) { + MIPSInfo info = MIPSGetInfo(op); + if (info & DELAYSLOT) { + ERROR_LOG_REPORT_ONCE(ateDelaySlot, JIT, "Ate a branch op."); + } + if (js.inDelaySlot) { + ERROR_LOG_REPORT_ONCE(ateInDelaySlot, JIT, "Ate an instruction inside a delay slot."); + } + + js.numInstructions++; + js.compilerPC += 4; + js.downcountAmount += MIPSGetInstructionCycleEstimate(op); +} + +void IRFrontend::CompileDelaySlot() { + js.inDelaySlot = true; + MIPSOpcode op = GetOffsetInstruction(1); + MIPSCompileOp(op, this); + js.inDelaySlot = false; +} + +bool IRFrontend::CheckRounding() { + bool cleanSlate = false; + if (js.hasSetRounding && !js.lastSetRounding) { + WARN_LOG(JIT, "Detected rounding mode usage, rebuilding jit with checks"); + // Won't loop, since hasSetRounding is only ever set to 1. + js.lastSetRounding = js.hasSetRounding; + cleanSlate = true; + } + + // Drat. The VFPU hit an uneaten prefix at the end of a block. + if (js.startDefaultPrefix && js.MayHavePrefix()) { + WARN_LOG(JIT, "An uneaten prefix at end of block"); + js.LogPrefix(); + + // Let's try that one more time. We won't get back here because we toggled the value. 
+ js.startDefaultPrefix = false; + // TODO: Make sure this works. + // cleanSlate = true; + } + + return cleanSlate; +} + + +void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { + int index = op.encoding & MIPS_EMUHACK_VALUE_MASK; + + const ReplacementTableEntry *entry = GetReplacementFunc(index); + if (!entry) { + ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding); + return; + } + + if (entry->flags & REPFLAG_DISABLED) { + MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); + } else if (entry->replaceFunc) { + FlushAll(); + RestoreRoundingMode(); + ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC())); + ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index)); + + if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { + // Compile the original instruction at this address. We ignore cycles for hooks. + ApplyRoundingMode(); + MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); + } else { + ApplyRoundingMode(); + ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); + ir.Write(IROp::ExitToReg, MIPS_REG_RA, 0, 0); + js.compiling = false; + } + } else { + ERROR_LOG(HLE, "Replacement function %s has neither jit nor regular impl", entry->name); + } +} + +void IRFrontend::Comp_Generic(MIPSOpcode op) { + FlushAll(); + ir.Write(IROp::Interpret, 0, ir.AddConstant(op.encoding)); + const MIPSInfo info = MIPSGetInfo(op); + if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) { + // If it does eat them, it'll happen in MIPSCompileOp(). + if ((info & OUT_EAT_PREFIX) == 0) + js.PrefixUnknown(); + } +} + +// Destroys SCRATCH2 +void IRFrontend::RestoreRoundingMode(bool force) { + // If the game has never set an interesting rounding mode, we can safely skip this. + if (force || js.hasSetRounding) { + ir.Write(IROp::RestoreRoundingMode); + } +} + +// Destroys SCRATCH1 and SCRATCH2 +void IRFrontend::ApplyRoundingMode(bool force) { + // If the game has never set an interesting rounding mode, we can safely skip this. 
+ if (force || js.hasSetRounding) { + ir.Write(IROp::ApplyRoundingMode); + } +} + +// Destroys SCRATCH1 and SCRATCH2 +void IRFrontend::UpdateRoundingMode() { + ir.Write(IROp::UpdateRoundingMode); +} + +void IRFrontend::Comp_DoNothing(MIPSOpcode op) { +} + +int IRFrontend::Replace_fabsf() { + Crash(); + return 0; +} + +u32 IRFrontend::GetCompilerPC() { + return js.compilerPC; +} + +MIPSOpcode IRFrontend::GetOffsetInstruction(int offset) { + return Memory::Read_Instruction(GetCompilerPC() + 4 * offset); +} + +void IRFrontend::DoJit(u32 em_address, IRBlock *b) { + js.cancel = false; + js.blockStart = em_address; + js.compilerPC = em_address; + js.lastContinuedPC = 0; + js.initialBlockSize = 0; + js.nextExit = 0; + js.downcountAmount = 0; + js.curBlock = nullptr; + js.compiling = true; + js.inDelaySlot = false; + js.PrefixStart(); + ir.Clear(); + + js.numInstructions = 0; + while (js.compiling) { + MIPSOpcode inst = Memory::Read_Opcode_JIT(GetCompilerPC()); + js.downcountAmount += MIPSGetInstructionCycleEstimate(inst); + MIPSCompileOp(inst, this); + js.compilerPC += 4; + js.numInstructions++; + + if (ir.GetConstants().size() > 64) { + // Need to break the block + ir.Write(IROp::ExitToConst, ir.AddConstant(js.compilerPC)); + js.compiling = false; + } + } + + IRWriter simplified; + IRWriter *code = &ir; + if (true) { + static const IRPassFunc passes[] = { + &PropagateConstants, + }; + if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) + logBlocks = 1; + code = &simplified; + if (ir.GetInstructions().size() >= 24) + logBlocks = 1; + } + + b->SetInstructions(code->GetInstructions(), code->GetConstants()); + + if (logBlocks > 0 && dontLogBlocks == 0) { + char temp2[256]; + ILOG("=============== mips %08x ===============", em_address); + for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { + temp2[0] = 0; + MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp2, true); + ILOG("M: %08x %s", cpc, temp2); + } + } + + if (logBlocks > 0 && dontLogBlocks == 0) { + ILOG("=============== Original IR (%d instructions, %d const) ===============", (int)ir.GetInstructions().size(), (int)ir.GetConstants().size()); + for (int i = 0; i < ir.GetInstructions().size(); i++) { + char buf[256]; + DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); + ILOG("%s", buf); + } + ILOG("=============== end ================="); + } + + if (logBlocks > 0 && dontLogBlocks == 0) { + ILOG("=============== IR (%d instructions, %d const) ===============", (int)code->GetInstructions().size(), (int)code->GetConstants().size()); + for (int i = 0; i < code->GetInstructions().size(); i++) { + char buf[256]; + DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); + ILOG("%s", buf); + } + ILOG("=============== end ================="); + } + + if (logBlocks > 0) + logBlocks--; + if (dontLogBlocks > 0) + dontLogBlocks--; +} + +void IRFrontend::Comp_RunBlock(MIPSOpcode op) { + // This shouldn't be necessary, the dispatcher should catch us before we get here. 
+ ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); +} + + +} // namespace \ No newline at end of file diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h new file mode 100644 index 000000000000..c5854dffdf7e --- /dev/null +++ b/Core/MIPS/IR/IRFrontend.h @@ -0,0 +1,144 @@ +#pragma once + +#include "Common/CommonTypes.h" +#include "Core/MIPS/JitCommon/JitCommon.h" +#include "Core/MIPS/JitCommon/JitState.h" +#include "Core/MIPS/MIPSVFPUUtils.h" +#include "Core/MIPS/IR/IRInst.h" + +namespace MIPSComp { + +class IRBlock; + +class IRFrontend : public MIPSFrontendInterface { +public: + IRFrontend(bool startDefaultPrefix); + void Comp_Generic(MIPSOpcode op) override; + + void Comp_RunBlock(MIPSOpcode op) override; + void Comp_ReplacementFunc(MIPSOpcode op) override; + + // Ops + void Comp_ITypeMem(MIPSOpcode op) override; + void Comp_Cache(MIPSOpcode op) override; + + void Comp_RelBranch(MIPSOpcode op) override; + void Comp_RelBranchRI(MIPSOpcode op) override; + void Comp_FPUBranch(MIPSOpcode op) override; + void Comp_FPULS(MIPSOpcode op) override; + void Comp_FPUComp(MIPSOpcode op) override; + void Comp_Jump(MIPSOpcode op) override; + void Comp_JumpReg(MIPSOpcode op) override; + void Comp_Syscall(MIPSOpcode op) override; + void Comp_Break(MIPSOpcode op) override; + + void Comp_IType(MIPSOpcode op) override; + void Comp_RType2(MIPSOpcode op) override; + void Comp_RType3(MIPSOpcode op) override; + void Comp_ShiftType(MIPSOpcode op) override; + void Comp_Allegrex(MIPSOpcode op) override; + void Comp_Allegrex2(MIPSOpcode op) override; + void Comp_VBranch(MIPSOpcode op) override; + void Comp_MulDivType(MIPSOpcode op) override; + void Comp_Special3(MIPSOpcode op) override; + + void Comp_FPU3op(MIPSOpcode op) override; + void Comp_FPU2op(MIPSOpcode op) override; + void Comp_mxc1(MIPSOpcode op) override; + + void Comp_DoNothing(MIPSOpcode op) override; + + void Comp_SV(MIPSOpcode op) override; + void Comp_SVQ(MIPSOpcode op) override; + void Comp_VPFX(MIPSOpcode op) override; + void Comp_VVectorInit(MIPSOpcode op) override; + void Comp_VMatrixInit(MIPSOpcode op) override; + void Comp_VDot(MIPSOpcode op) override; + void Comp_VecDo3(MIPSOpcode op) override; + void Comp_VV2Op(MIPSOpcode op) override; + void Comp_Mftv(MIPSOpcode op) override; + void Comp_Vmfvc(MIPSOpcode op) override; + void Comp_Vmtvc(MIPSOpcode op) override; + void Comp_Vmmov(MIPSOpcode op) override; + void Comp_VScl(MIPSOpcode op) override; + void Comp_Vmmul(MIPSOpcode op) override; + void Comp_Vmscl(MIPSOpcode op) override; + void Comp_Vtfm(MIPSOpcode op) override; + void Comp_VHdp(MIPSOpcode op) override; + void Comp_VCrs(MIPSOpcode op) override; + void Comp_VDet(MIPSOpcode op) override; + void Comp_Vi2x(MIPSOpcode op) override; + void Comp_Vx2i(MIPSOpcode op) override; + void Comp_Vf2i(MIPSOpcode op) override; + void Comp_Vi2f(MIPSOpcode op) override; + void Comp_Vh2f(MIPSOpcode op) override; + void Comp_Vcst(MIPSOpcode op) override; + void Comp_Vhoriz(MIPSOpcode op) override; + void Comp_VRot(MIPSOpcode op) override; + void Comp_VIdt(MIPSOpcode op) override; + void Comp_Vcmp(MIPSOpcode op) override; + void Comp_Vcmov(MIPSOpcode op) override; + void Comp_Viim(MIPSOpcode op) override; + void Comp_Vfim(MIPSOpcode op) override; + void Comp_VCrossQuat(MIPSOpcode op) override; + void Comp_Vsgn(MIPSOpcode op) override; + void Comp_Vocp(MIPSOpcode op) override; + void Comp_ColorConv(MIPSOpcode op) override; + void Comp_Vbfy(MIPSOpcode op) override; + + int Replace_fabsf(); + void DoState(PointerWrap &p); + bool 
CheckRounding(); // returns true if we need a do-over + void DoJit(u32 em_address, IRBlock *b); + +private: + void RestoreRoundingMode(bool force = false); + void ApplyRoundingMode(bool force = false); + void UpdateRoundingMode(); + + void EatPrefix() { js.EatPrefix(); } + + void FlushAll(); + void FlushPrefixV(); + + u32 GetCompilerPC(); + void CompileDelaySlot(); + void EatInstruction(MIPSOpcode op); + MIPSOpcode GetOffsetInstruction(int offset); + + // Utility compilation functions + void BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely); + void BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely); + void BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely); + void BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely); + + // Utilities to reduce duplicated code + void CompShiftImm(MIPSOpcode op, IROp shiftType, int sa); + void CompShiftVar(MIPSOpcode op, IROp shiftType, IROp shiftTypeConst); + + void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz); + void ApplyPrefixD(const u8 *vregs, VectorSize sz); + void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixS, sz); + } + void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixT, sz); + } + void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); + + // Utils + void Comp_ITypeMemLR(MIPSOpcode op, bool load); + + // State + JitState js; + IRWriter ir; + + int dontLogBlocks; + int logBlocks; +}; + +} // namespace \ No newline at end of file diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 9ed4f2eb70de..94ab72bfd98a 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -18,7 +18,6 @@ #include "base/logging.h" #include "profiler/profiler.h" #include "Common/ChunkFile.h" -#include "Common/CPUDetect.h" #include "Common/StringUtils.h" #include "Core/Reporting.h" @@ -32,7 +31,6 @@ #include "Core/MIPS/MIPSCodeUtils.h" #include "Core/MIPS/MIPSInt.h" #include "Core/MIPS/MIPSTables.h" -#include "Core/HLE/ReplaceTables.h" #include "Core/HLE/sceKernelMemory.h" #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRJit.h" @@ -51,27 +49,6 @@ IRJit::IRJit(MIPSState *mips) : mips_(mips), frontend_(mips->HasDefaultPrefix()) IRJit::~IRJit() { } -IRFrontend::IRFrontend(bool startDefaultPrefix) { - logBlocks = 0; - dontLogBlocks = 0; - js.startDefaultPrefix = startDefaultPrefix; - // js.currentRoundingFunc = convertS0ToSCRATCH1[0]; -} - -void IRFrontend::DoState(PointerWrap &p) { - auto s = p.Section("Jit", 1, 2); - if (!s) - return; - - p.Do(js.startDefaultPrefix); - if (s >= 2) { - p.Do(js.hasSetRounding); - js.lastSetRounding = 0; - } else { - js.hasSetRounding = 1; - } -} - void IRJit::DoState(PointerWrap &p) { frontend_.DoState(p); } @@ -90,27 +67,6 @@ void IRJit::DoDummyState(PointerWrap &p) { } } -void IRFrontend::FlushAll() { - FlushPrefixV(); -} - -void IRFrontend::FlushPrefixV() { - if ((js.prefixSFlag & JitState::PREFIX_DIRTY) != 0) { - ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_SPREFIX, ir.AddConstant(js.prefixS)); - js.prefixSFlag = (JitState::PrefixState) (js.prefixSFlag & ~JitState::PREFIX_DIRTY); - } - - if ((js.prefixTFlag & JitState::PREFIX_DIRTY) != 0) { - ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_TPREFIX, ir.AddConstant(js.prefixT)); - js.prefixTFlag = (JitState::PrefixState) 
(js.prefixTFlag & ~JitState::PREFIX_DIRTY); - } - - if ((js.prefixDFlag & JitState::PREFIX_DIRTY) != 0) { - ir.Write(IROp::SetCtrlVFPU, VFPU_CTRL_DPREFIX, ir.AddConstant(js.prefixD)); - js.prefixDFlag = (JitState::PrefixState) (js.prefixDFlag & ~JitState::PREFIX_DIRTY); - } -} - void IRJit::ClearCache() { ILOG("IRJit: Clearing the cache!"); blocks_.Clear(); @@ -124,50 +80,6 @@ void IRJit::InvalidateCacheAt(u32 em_address, int length) { blocks_.InvalidateICache(em_address, length); } -void IRFrontend::EatInstruction(MIPSOpcode op) { - MIPSInfo info = MIPSGetInfo(op); - if (info & DELAYSLOT) { - ERROR_LOG_REPORT_ONCE(ateDelaySlot, JIT, "Ate a branch op."); - } - if (js.inDelaySlot) { - ERROR_LOG_REPORT_ONCE(ateInDelaySlot, JIT, "Ate an instruction inside a delay slot."); - } - - js.numInstructions++; - js.compilerPC += 4; - js.downcountAmount += MIPSGetInstructionCycleEstimate(op); -} - -void IRFrontend::CompileDelaySlot() { - js.inDelaySlot = true; - MIPSOpcode op = GetOffsetInstruction(1); - MIPSCompileOp(op, this); - js.inDelaySlot = false; -} - -bool IRFrontend::CheckRounding() { - bool cleanSlate = false; - if (js.hasSetRounding && !js.lastSetRounding) { - WARN_LOG(JIT, "Detected rounding mode usage, rebuilding jit with checks"); - // Won't loop, since hasSetRounding is only ever set to 1. - js.lastSetRounding = js.hasSetRounding; - cleanSlate = true; - } - - // Drat. The VFPU hit an uneaten prefix at the end of a block. - if (js.startDefaultPrefix && js.MayHavePrefix()) { - WARN_LOG(JIT, "An uneaten prefix at end of block"); - js.LogPrefix(); - - // Let's try that one more time. We won't get back here because we toggled the value. - js.startDefaultPrefix = false; - // TODO: Make sure this works. - // cleanSlate = true; - } - - return cleanSlate; -} - void IRJit::Compile(u32 em_address) { PROFILE_THIS_SCOPE("jitc"); @@ -214,104 +126,11 @@ void IRJit::RunLoopUntil(u64 globalticks) { // RestoreRoundingMode(true); } -u32 IRFrontend::GetCompilerPC() { - return js.compilerPC; -} - -MIPSOpcode IRFrontend::GetOffsetInstruction(int offset) { - return Memory::Read_Instruction(GetCompilerPC() + 4 * offset); -} - -void IRFrontend::DoJit(u32 em_address, IRBlock *b) { - js.cancel = false; - js.blockStart = em_address; - js.compilerPC = em_address; - js.lastContinuedPC = 0; - js.initialBlockSize = 0; - js.nextExit = 0; - js.downcountAmount = 0; - js.curBlock = nullptr; - js.compiling = true; - js.inDelaySlot = false; - js.PrefixStart(); - ir.Clear(); - - js.numInstructions = 0; - while (js.compiling) { - MIPSOpcode inst = Memory::Read_Opcode_JIT(GetCompilerPC()); - js.downcountAmount += MIPSGetInstructionCycleEstimate(inst); - MIPSCompileOp(inst, this); - js.compilerPC += 4; - js.numInstructions++; - - if (ir.GetConstants().size() > 64) { - // Need to break the block - ir.Write(IROp::ExitToConst, ir.AddConstant(js.compilerPC)); - js.compiling = false; - } - } - - IRWriter simplified; - IRWriter *code = &ir; - if (true) { - static const IRPassFunc passes[] = { - &PropagateConstants, - }; - if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) - logBlocks = 1; - code = &simplified; - if (ir.GetInstructions().size() >= 24) - logBlocks = 1; - } - - b->SetInstructions(code->GetInstructions(), code->GetConstants()); - - if (logBlocks > 0 && dontLogBlocks == 0) { - char temp2[256]; - ILOG("=============== mips %08x ===============", em_address); - for (u32 cpc = em_address; cpc != GetCompilerPC() + 4; cpc += 4) { - temp2[0] = 0; - MIPSDisAsm(Memory::Read_Opcode_JIT(cpc), cpc, temp2, true); - 
ILOG("M: %08x %s", cpc, temp2); - } - } - - if (logBlocks > 0 && dontLogBlocks == 0) { - ILOG("=============== Original IR (%d instructions, %d const) ===============", (int)ir.GetInstructions().size(), (int)ir.GetConstants().size()); - for (int i = 0; i < ir.GetInstructions().size(); i++) { - char buf[256]; - DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); - ILOG("%s", buf); - } - ILOG("=============== end ================="); - } - - if (logBlocks > 0 && dontLogBlocks == 0) { - ILOG("=============== IR (%d instructions, %d const) ===============", (int)code->GetInstructions().size(), (int)code->GetConstants().size()); - for (int i = 0; i < code->GetInstructions().size(); i++) { - char buf[256]; - DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); - ILOG("%s", buf); - } - ILOG("=============== end ================="); - } - - if (logBlocks > 0) - logBlocks--; - if (dontLogBlocks > 0) - dontLogBlocks--; -} - bool IRJit::DescribeCodePtr(const u8 *ptr, std::string &name) { // Used in disassembly viewer. return false; } -void IRFrontend::Comp_RunBlock(MIPSOpcode op) { - // This shouldn't be necessary, the dispatcher should catch us before we get here. - ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); -} - void IRJit::LinkBlock(u8 *exitPoint, const u8 *checkedEntry) { Crash(); } @@ -325,78 +144,6 @@ bool IRJit::ReplaceJalTo(u32 dest) { return false; } -void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { - int index = op.encoding & MIPS_EMUHACK_VALUE_MASK; - - const ReplacementTableEntry *entry = GetReplacementFunc(index); - if (!entry) { - ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding); - return; - } - - if (entry->flags & REPFLAG_DISABLED) { - MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); - } else if (entry->replaceFunc) { - FlushAll(); - RestoreRoundingMode(); - ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC())); - ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index)); - - if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { - // Compile the original instruction at this address. We ignore cycles for hooks. - ApplyRoundingMode(); - MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); - } else { - ApplyRoundingMode(); - ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); - ir.Write(IROp::ExitToReg, MIPS_REG_RA, 0, 0); - js.compiling = false; - } - } else { - ERROR_LOG(HLE, "Replacement function %s has neither jit nor regular impl", entry->name); - } -} - -void IRFrontend::Comp_Generic(MIPSOpcode op) { - FlushAll(); - ir.Write(IROp::Interpret, 0, ir.AddConstant(op.encoding)); - const MIPSInfo info = MIPSGetInfo(op); - if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) { - // If it does eat them, it'll happen in MIPSCompileOp(). - if ((info & OUT_EAT_PREFIX) == 0) - js.PrefixUnknown(); - } -} - -// Destroys SCRATCH2 -void IRFrontend::RestoreRoundingMode(bool force) { - // If the game has never set an interesting rounding mode, we can safely skip this. - if (force || js.hasSetRounding) { - ir.Write(IROp::RestoreRoundingMode); - } -} - -// Destroys SCRATCH1 and SCRATCH2 -void IRFrontend::ApplyRoundingMode(bool force) { - // If the game has never set an interesting rounding mode, we can safely skip this. 
- if (force || js.hasSetRounding) { - ir.Write(IROp::ApplyRoundingMode); - } -} - -// Destroys SCRATCH1 and SCRATCH2 -void IRFrontend::UpdateRoundingMode() { - ir.Write(IROp::UpdateRoundingMode); -} - -void IRFrontend::Comp_DoNothing(MIPSOpcode op) { -} - -int IRFrontend::Replace_fabsf() { - Crash(); - return 0; -} - void IRBlockCache::Clear() { blocks_.clear(); } diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 3947136ea93f..aa026b0bd8d5 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -20,11 +20,11 @@ #include #include "Common/CPUDetect.h" -#include "Core/MIPS/JitCommon/JitState.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "Core/MIPS/JitCommon/JitCommon.h" #include "Core/MIPS/IR/IRRegCache.h" #include "Core/MIPS/IR/IRInst.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/MIPSVFPUUtils.h" #ifndef offsetof @@ -101,137 +101,6 @@ class IRBlockCache { std::vector blocks_; }; -class IRFrontend : public MIPSFrontendInterface { -public: - IRFrontend(bool startDefaultPrefix); - void Comp_Generic(MIPSOpcode op) override; - - void Comp_RunBlock(MIPSOpcode op) override; - void Comp_ReplacementFunc(MIPSOpcode op) override; - - // Ops - void Comp_ITypeMem(MIPSOpcode op) override; - void Comp_Cache(MIPSOpcode op) override; - - void Comp_RelBranch(MIPSOpcode op) override; - void Comp_RelBranchRI(MIPSOpcode op) override; - void Comp_FPUBranch(MIPSOpcode op) override; - void Comp_FPULS(MIPSOpcode op) override; - void Comp_FPUComp(MIPSOpcode op) override; - void Comp_Jump(MIPSOpcode op) override; - void Comp_JumpReg(MIPSOpcode op) override; - void Comp_Syscall(MIPSOpcode op) override; - void Comp_Break(MIPSOpcode op) override; - - void Comp_IType(MIPSOpcode op) override; - void Comp_RType2(MIPSOpcode op) override; - void Comp_RType3(MIPSOpcode op) override; - void Comp_ShiftType(MIPSOpcode op) override; - void Comp_Allegrex(MIPSOpcode op) override; - void Comp_Allegrex2(MIPSOpcode op) override; - void Comp_VBranch(MIPSOpcode op) override; - void Comp_MulDivType(MIPSOpcode op) override; - void Comp_Special3(MIPSOpcode op) override; - - void Comp_FPU3op(MIPSOpcode op) override; - void Comp_FPU2op(MIPSOpcode op) override; - void Comp_mxc1(MIPSOpcode op) override; - - void Comp_DoNothing(MIPSOpcode op) override; - - void Comp_SV(MIPSOpcode op) override; - void Comp_SVQ(MIPSOpcode op) override; - void Comp_VPFX(MIPSOpcode op) override; - void Comp_VVectorInit(MIPSOpcode op) override; - void Comp_VMatrixInit(MIPSOpcode op) override; - void Comp_VDot(MIPSOpcode op) override; - void Comp_VecDo3(MIPSOpcode op) override; - void Comp_VV2Op(MIPSOpcode op) override; - void Comp_Mftv(MIPSOpcode op) override; - void Comp_Vmfvc(MIPSOpcode op) override; - void Comp_Vmtvc(MIPSOpcode op) override; - void Comp_Vmmov(MIPSOpcode op) override; - void Comp_VScl(MIPSOpcode op) override; - void Comp_Vmmul(MIPSOpcode op) override; - void Comp_Vmscl(MIPSOpcode op) override; - void Comp_Vtfm(MIPSOpcode op) override; - void Comp_VHdp(MIPSOpcode op) override; - void Comp_VCrs(MIPSOpcode op) override; - void Comp_VDet(MIPSOpcode op) override; - void Comp_Vi2x(MIPSOpcode op) override; - void Comp_Vx2i(MIPSOpcode op) override; - void Comp_Vf2i(MIPSOpcode op) override; - void Comp_Vi2f(MIPSOpcode op) override; - void Comp_Vh2f(MIPSOpcode op) override; - void Comp_Vcst(MIPSOpcode op) override; - void Comp_Vhoriz(MIPSOpcode op) override; - void Comp_VRot(MIPSOpcode op) override; - void Comp_VIdt(MIPSOpcode op) override; - void Comp_Vcmp(MIPSOpcode op) override; - void Comp_Vcmov(MIPSOpcode 
op) override; - void Comp_Viim(MIPSOpcode op) override; - void Comp_Vfim(MIPSOpcode op) override; - void Comp_VCrossQuat(MIPSOpcode op) override; - void Comp_Vsgn(MIPSOpcode op) override; - void Comp_Vocp(MIPSOpcode op) override; - void Comp_ColorConv(MIPSOpcode op) override; - void Comp_Vbfy(MIPSOpcode op) override; - - int Replace_fabsf(); - void DoState(PointerWrap &p); - bool CheckRounding(); // returns true if we need a do-over - void DoJit(u32 em_address, IRBlock *b); - -private: - void RestoreRoundingMode(bool force = false); - void ApplyRoundingMode(bool force = false); - void UpdateRoundingMode(); - - void EatPrefix() { js.EatPrefix(); } - - void FlushAll(); - void FlushPrefixV(); - - u32 GetCompilerPC(); - void CompileDelaySlot(); - void EatInstruction(MIPSOpcode op); - MIPSOpcode GetOffsetInstruction(int offset); - - // Utility compilation functions - void BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely); - void BranchVFPUFlag(MIPSOpcode op, IRComparison cc, bool likely); - void BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, bool likely); - void BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely); - - // Utilities to reduce duplicated code - void CompShiftImm(MIPSOpcode op, IROp shiftType, int sa); - void CompShiftVar(MIPSOpcode op, IROp shiftType, IROp shiftTypeConst); - - void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz); - void ApplyPrefixD(const u8 *vregs, VectorSize sz); - void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { - _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); - GetVectorRegs(regs, sz, vectorReg); - ApplyPrefixST(regs, js.prefixS, sz); - } - void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { - _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); - GetVectorRegs(regs, sz, vectorReg); - ApplyPrefixST(regs, js.prefixT, sz); - } - void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); - - // Utils - void Comp_ITypeMemLR(MIPSOpcode op, bool load); - - // State - JitState js; - IRWriter ir; - - int dontLogBlocks; - int logBlocks; -}; - class IRJit : public JitInterface { public: IRJit(MIPSState *mips); diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 92a10e800c63..115b96997252 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -157,6 +157,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/Core/MIPS/MIPSVFPUUtils.cpp.arm \ $(SRC)/Core/MIPS/MIPSCodeUtils.cpp.arm \ $(SRC)/Core/MIPS/MIPSDebugInterface.cpp \ + $(SRC)/Core/MIPS/IR/IRFrontend.cpp \ $(SRC)/Core/MIPS/IR/IRJit.cpp \ $(SRC)/Core/MIPS/IR/IRCompALU.cpp \ $(SRC)/Core/MIPS/IR/IRCompBranch.cpp \ From 28087a6088c41c5cb7a9f56515f59b8ae38a741b Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 20:18:22 +0200 Subject: [PATCH 42/77] IRFrontend shouldn't know about IRBlock --- Core/MIPS/IR/IRFrontend.cpp | 5 +++-- Core/MIPS/IR/IRFrontend.h | 5 ++--- Core/MIPS/IR/IRJit.cpp | 6 +++++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 94a7ebe1056d..bf8e5d573a01 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -199,7 +199,7 @@ MIPSOpcode IRFrontend::GetOffsetInstruction(int offset) { return Memory::Read_Instruction(GetCompilerPC() + 4 * offset); } -void IRFrontend::DoJit(u32 em_address, IRBlock *b) { +void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::vector &constants) { js.cancel = false; js.blockStart = em_address; js.compilerPC = em_address; @@ -241,7 +241,8 @@ void 
IRFrontend::DoJit(u32 em_address, IRBlock *b) { logBlocks = 1; } - b->SetInstructions(code->GetInstructions(), code->GetConstants()); + instructions = code->GetInstructions(); + constants = code->GetConstants(); if (logBlocks > 0 && dontLogBlocks == 0) { char temp2[256]; diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h index c5854dffdf7e..9b8db0c76a04 100644 --- a/Core/MIPS/IR/IRFrontend.h +++ b/Core/MIPS/IR/IRFrontend.h @@ -8,8 +8,6 @@ namespace MIPSComp { -class IRBlock; - class IRFrontend : public MIPSFrontendInterface { public: IRFrontend(bool startDefaultPrefix); @@ -89,7 +87,8 @@ class IRFrontend : public MIPSFrontendInterface { int Replace_fabsf(); void DoState(PointerWrap &p); bool CheckRounding(); // returns true if we need a do-over - void DoJit(u32 em_address, IRBlock *b); + + void DoJit(u32 em_address, std::vector &instructions, std::vector &constants); private: void RestoreRoundingMode(bool force = false); diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 94ab72bfd98a..661543a9d0df 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -85,7 +85,11 @@ void IRJit::Compile(u32 em_address) { int block_num = blocks_.AllocateBlock(em_address); IRBlock *b = blocks_.GetBlock(block_num); - frontend_.DoJit(em_address, b); + + std::vector instructions; + std::vector constants; + frontend_.DoJit(em_address, instructions, constants); + b->SetInstructions(instructions, constants); b->Finalize(block_num); // Overwrites the first instruction if (frontend_.CheckRounding()) { From a5d5c5ce2b589e4beb08f3ca61daad1e41bc10d5 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 22:40:59 +0200 Subject: [PATCH 43/77] Do the voffset remapping before the IR. This will let us easily add some virtual VFPU registers for the IR to the end, plus it's slightly faster. --- Core/MIPS/IR/IRCompVFPU.cpp | 20 ++++++++++---------- Core/MIPS/IR/IRFrontend.cpp | 1 - Core/MIPS/IR/IRInterpreter.cpp | 10 +++++----- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index e6f5ca3a8757..a2a3295c6e27 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -192,20 +192,20 @@ namespace MIPSComp { case 54: //lv.q { // TODO: Add vector load/store instruction to the IR - ir.Write(IROp::LoadFloatV, vregs[0], rs, ir.AddConstant(imm)); - ir.Write(IROp::LoadFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::LoadFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::LoadFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + ir.Write(IROp::LoadFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm)); + ir.Write(IROp::LoadFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::LoadFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::LoadFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12)); } break; case 62: //sv.q { // CC might be set by slow path below, so load regs first. 
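Patch 43 above moves the voffset[] lookup from run time to compile time: the frontend writes the already-remapped index into the IR operand, so the interpreter can index v[] directly, and the commit message notes this also leaves room to append virtual VFPU temporaries past the end. A minimal before/after sketch; the 4-entry remap table here is made up, the real voffset[] covers all 128 VFPU registers:

#include <cstdint>
#include <cstdio>

// Made-up remap table: logical VFPU register -> index into the flat v[] array.
// The real voffset[] has 128 entries and encodes the PSP's register ordering.
static const uint8_t kVoffset[4] = {2, 0, 3, 1};

struct Inst { uint8_t dest; };
static float v[4] = {10.f, 11.f, 12.f, 13.f};

// Before: the IR stores the logical register, so the interpreter remaps on every execution.
float ExecRemapAtRuntime(Inst in) { return v[kVoffset[in.dest]]; }

// After: the frontend bakes the remap into the operand, so execution is a direct index.
Inst EmitRemapped(uint8_t logicalReg) { return Inst{kVoffset[logicalReg]}; }
float ExecDirect(Inst in) { return v[in.dest]; }

int main() {
    Inst slow{1};                    // logical reg 1, remapped at runtime
    Inst fast = EmitRemapped(1);     // logical reg 1, remapped at compile time
    printf("%f %f\n", ExecRemapAtRuntime(slow), ExecDirect(fast));  // both read v[0]
    return 0;
}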
- ir.Write(IROp::StoreFloatV, vregs[0], rs, ir.AddConstant(imm)); - ir.Write(IROp::StoreFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::StoreFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::StoreFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + ir.Write(IROp::StoreFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm)); + ir.Write(IROp::StoreFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::StoreFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::StoreFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12)); } break; @@ -274,7 +274,7 @@ namespace MIPSComp { // rt = 0, imm = 255 appears to be used as a CPU interlock by some games. if (rt != 0) { if (imm < 128) { //R(rt) = VI(imm); - ir.Write(IROp::VMovToGPR, rt, imm); + ir.Write(IROp::VMovToGPR, rt, voffset[imm]); } else { DISABLE; } @@ -283,7 +283,7 @@ namespace MIPSComp { case 7: // mtv if (imm < 128) { - ir.Write(IROp::VMovFromGPR, imm, rt); + ir.Write(IROp::VMovFromGPR, voffset[imm], rt); } else { DISABLE; } diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index bf8e5d573a01..95ce9d9c4b02 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -26,7 +26,6 @@ #include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/IR/IRRegCache.h" -#include "Core/MIPS/IR/IRJit.h" #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/IR/IRInterpreter.h" diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 03b06c77e129..54f20edb4a76 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -21,7 +21,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); break; case IROp::SetConstV: - memcpy(&mips->v[voffset[inst->dest]], &constPool[inst->src1], 4); + memcpy(&mips->v[inst->dest], &constPool[inst->src1], 4); break; case IROp::Add: mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; @@ -88,7 +88,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); break; case IROp::LoadFloatV: - mips->v[voffset[inst->dest]] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + mips->v[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); break; case IROp::Store8: @@ -104,7 +104,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; case IROp::StoreFloatV: - Memory::WriteUnchecked_Float(mips->v[voffset[inst->src3]], mips->r[inst->src1] + constPool[inst->src2]); + Memory::WriteUnchecked_Float(mips->v[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; case IROp::ShlImm: @@ -314,10 +314,10 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; case IROp::VMovFromGPR: - memcpy(&mips->v[voffset[inst->dest]], &mips->r[inst->src1], 4); + memcpy(&mips->v[inst->dest], &mips->r[inst->src1], 4); break; case IROp::VMovToGPR: - memcpy(&mips->r[inst->dest], &mips->v[voffset[inst->src1]], 4); + memcpy(&mips->r[inst->dest], &mips->v[inst->src1], 4); break; case IROp::ExitToConst: From 558bb197c795d3caca1f6f52111a980ce2569c2b Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Mon, 9 May 2016 23:47:56 +0200 Subject: [PATCH 44/77] More VFPU --- 
Core/MIPS/IR/IRCompALU.cpp | 4 +- Core/MIPS/IR/IRCompBranch.cpp | 2 +- Core/MIPS/IR/IRCompFPU.cpp | 2 +- Core/MIPS/IR/IRCompLoadStore.cpp | 2 +- Core/MIPS/IR/IRCompVFPU.cpp | 64 +++++++++++++++++++++++--------- Core/MIPS/IR/IRFrontend.cpp | 4 +- Core/MIPS/IR/IRInst.cpp | 2 + Core/MIPS/IR/IRInst.h | 11 ++++-- Core/MIPS/IR/IRInterpreter.cpp | 27 ++++++++++++++ Core/MIPS/IR/IRPassSimplify.cpp | 4 +- Core/MIPS/MIPS.h | 21 ++++++----- 11 files changed, 104 insertions(+), 39 deletions(-) diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp index 6500d1b0df11..8ffa632affd2 100644 --- a/Core/MIPS/IR/IRCompALU.cpp +++ b/Core/MIPS/IR/IRCompALU.cpp @@ -19,11 +19,9 @@ #include "Core/MIPS/MIPS.h" #include "Core/MIPS/MIPSCodeUtils.h" -#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Common/CPUDetect.h" -using namespace MIPSAnalyst; - #define _RS MIPS_GET_RS(op) #define _RT MIPS_GET_RT(op) #define _RD MIPS_GET_RD(op) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 3dda003b562d..76833bf32906 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -28,7 +28,7 @@ #include "Core/MIPS/MIPSAnalyst.h" #include "Core/MIPS/MIPSTables.h" -#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "Common/Arm64Emitter.h" diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 1ca4a08e96ac..068a58013a87 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -21,7 +21,7 @@ #include "Core/MIPS/MIPSCodeUtils.h" #include "Core/MIPS/MIPSTables.h" -#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/IR/IRRegCache.h" #include "Common/CPUDetect.h" diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp index 41c76a1a7d83..b890f4ff6808 100644 --- a/Core/MIPS/IR/IRCompLoadStore.cpp +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -42,7 +42,7 @@ #include "Core/MIPS/MIPS.h" #include "Core/MIPS/MIPSAnalyst.h" #include "Core/MIPS/MIPSCodeUtils.h" -#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/IR/IRRegCache.h" #define _RS MIPS_GET_RS(op) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index a2a3295c6e27..2bb96e754942 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -27,7 +27,7 @@ #include "Core/Config.h" #include "Core/Reporting.h" -#include "Core/MIPS/IR/IRJit.h" +#include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/IR/IRRegCache.h" // All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. 
@@ -50,6 +50,15 @@ #define _IMM26 (op & 0x03FFFFFF) namespace MIPSComp { + static void ApplyVoffset(u8 regs[4], int count) { + for (int i = 0; i < count; i++) { + regs[i] = voffset[regs[i]]; + } + } + + static bool IsConsecutive4(const u8 regs[4]) { + return (regs[1] == regs[0] + 1 && regs[2] == regs[1] + 1 && regs[3] == regs[2] + 1); + } void IRFrontend::Comp_VPFX(MIPSOpcode op) { CONDITIONAL_DISABLE; @@ -177,7 +186,21 @@ namespace MIPSComp { } void IRFrontend::Comp_SV(MIPSOpcode op) { - DISABLE; + s32 offset = (signed short)(op & 0xFFFC); + int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5); + MIPSGPReg rs = _RS; + switch (op >> 26) { + case 50: //lv.s + ir.Write(IROp::LoadFloatV, voffset[vt], rs, ir.AddConstant(offset)); + break; + + case 58: //sv.s + ir.Write(IROp::StoreFloatV, voffset[vt], rs, ir.AddConstant(offset)); + break; + + default: + DISABLE; + } } void IRFrontend::Comp_SVQ(MIPSOpcode op) { @@ -187,27 +210,32 @@ namespace MIPSComp { u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); + ApplyVoffset(vregs, 4); // Translate to memory order switch (op >> 26) { case 54: //lv.q - { - // TODO: Add vector load/store instruction to the IR - ir.Write(IROp::LoadFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm)); - ir.Write(IROp::LoadFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::LoadFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::LoadFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12)); - } - break; + if (IsConsecutive4(vregs)) { + ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm)); + } else { + // Let's not even bother with "vertical" loads for now. + ir.Write(IROp::LoadFloatV, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::LoadFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::LoadFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::LoadFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + } + break; case 62: //sv.q - { - // CC might be set by slow path below, so load regs first. - ir.Write(IROp::StoreFloatV, voffset[vregs[0]], rs, ir.AddConstant(imm)); - ir.Write(IROp::StoreFloatV, voffset[vregs[1]], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::StoreFloatV, voffset[vregs[2]], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::StoreFloatV, voffset[vregs[3]], rs, ir.AddConstant(imm + 12)); - } - break; + if (IsConsecutive4(vregs)) { + ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm)); + } else { + // Let's not even bother with "vertical" stores for now. 
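The IsConsecutive4() check above is what makes the single LoadVec4/StoreVec4 legal: only when the four remapped registers are adjacent in v[] can one 16-byte vector copy replace four scalar ones; otherwise the frontend falls back to per-lane loads and stores. A small illustrative sketch of that decision plus interpreter-side execution; unaligned intrinsics are used to keep this standalone example safe, whereas the patch uses the aligned forms and relies on the alignment of the real buffers:

#include <cstdint>
#include <cstring>
#ifdef _M_SSE
#include <emmintrin.h>
#endif

static bool IsConsecutive4(const uint8_t regs[4]) {
    return regs[1] == regs[0] + 1 &&
           regs[2] == regs[1] + 1 &&
           regs[3] == regs[2] + 1;
}

// Copy four floats from "guest memory" (here just a buffer) into VFPU regs:
// one vector op when the destination lanes are contiguous, otherwise four
// scalar copies -- the same split as LoadVec4 vs. LoadFloat in the patch.
static void LoadQuad(float *v, const uint8_t regs[4], const float *mem) {
    if (IsConsecutive4(regs)) {
#ifdef _M_SSE
        _mm_storeu_ps(&v[regs[0]], _mm_loadu_ps(mem));
#else
        memcpy(&v[regs[0]], mem, 4 * sizeof(float));
#endif
    } else {
        for (int i = 0; i < 4; i++)
            v[regs[i]] = mem[i];
    }
}

int main() {
    float v[8] = {};
    const float mem[4] = {1.f, 2.f, 3.f, 4.f};
    const uint8_t contiguous[4] = {0, 1, 2, 3};
    const uint8_t scattered[4]  = {4, 6, 5, 7};
    LoadQuad(v, contiguous, mem);   // single 16-byte copy
    LoadQuad(v, scattered, mem);    // falls back to per-element copies
    return (v[0] == 1.f && v[4] == 1.f) ? 0 : 1;
}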
+ ir.Write(IROp::StoreFloatV, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::StoreFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::StoreFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::StoreFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + } + break; default: DISABLE; diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 95ce9d9c4b02..3b13978b43bf 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -236,8 +236,8 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) logBlocks = 1; code = &simplified; - if (ir.GetInstructions().size() >= 24) - logBlocks = 1; + //if (ir.GetInstructions().size() >= 24) + // logBlocks = 1; } instructions = code->GetInstructions(); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 6b3231ce6d9c..d82e72ccdb86 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -60,11 +60,13 @@ static const IRMeta irMeta[] = { { IROp::Load32, "Load32", "GGC" }, { IROp::LoadFloat, "LoadFloat", "FGC" }, { IROp::LoadFloatV, "LoadFloatV", "VGC" }, + { IROp::LoadVec4, "LoadVec4", "VGC" }, { IROp::Store8, "Store8", "GGC" }, { IROp::Store16, "Store16", "GGC" }, { IROp::Store32, "Store32", "GGC" }, { IROp::StoreFloat, "StoreFloat", "FGC" }, { IROp::StoreFloatV, "StoreFloatV", "VGC" }, + { IROp::StoreVec4, "StoreVec4", "VGC" }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 77d71ed91534..70f0e0ff6e82 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -90,12 +90,14 @@ enum class IROp : u8 { Load32, LoadFloat, LoadFloatV, + LoadVec4, Store8, Store16, Store32, StoreFloat, StoreFloatV, + StoreVec4, Ext8to32, Ext16to32, @@ -212,13 +214,16 @@ enum { IRTEMP_LHS, // Reserved for use in branches IRTEMP_RHS, // Reserved for use in branches + // 16 float temps for vector S and T prefixes and things like that. 
+ // IRVTEMP_0 = 208 - 64, // -64 to be relative to v[0] + // Hacky way to get to other state - IRREG_VPFU_CTRL_BASE = 208, - IRREG_VPFU_CC = 211, + IRREG_VFPU_CTRL_BASE = 208, + IRREG_VFPU_CC = 211, IRREG_LO = 226, // offset of lo in MIPSState / 4 IRREG_HI = 227, IRREG_FCR31 = 228, - IRREG_FPCOND = 229 + IRREG_FPCOND = 229, }; struct IRMeta { diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 54f20edb4a76..63e0bd533ef4 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -1,3 +1,7 @@ +#ifdef _M_SSE +#include +#endif + #include "Core/MemMap.h" #include "Core/HLE/HLE.h" #include "Core/HLE/ReplaceTables.h" @@ -107,6 +111,29 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c Memory::WriteUnchecked_Float(mips->v[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; + case IROp::LoadVec4: + { + u32 base = mips->r[inst->src1] + constPool[inst->src2]; +#if defined(_M_SSE) + _mm_store_ps(&mips->v[inst->dest], _mm_load_ps((const float *)Memory::GetPointerUnchecked(base))); +#else + for (int i = 0; i < 4; i++) + mips->v[inst->dest + i] = Memory::ReadUnchecked_Float(base + 4 * i); +#endif + break; + } + case IROp::StoreVec4: + { + u32 base = mips->r[inst->src1] + constPool[inst->src2]; +#if defined(_M_SSE) + _mm_store_ps((float *)Memory::GetPointerUnchecked(base), _mm_load_ps(&mips->v[inst->dest])); +#else + for (int i = 0; i < 4; i++) + Memory::WriteUnchecked_Float(mips->v[inst->dest + i], base + 4 * i); +#endif + break; + } + case IROp::ShlImm: mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; break; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 0e5353ff5717..50bfca890357 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -291,6 +291,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::StoreFloat: case IROp::StoreFloatV: + case IROp::StoreVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); } else { @@ -314,6 +315,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::LoadFloat: case IROp::LoadFloatV: + case IROp::LoadVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); } else { @@ -388,7 +390,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { goto doDefault; case IROp::VfpuCtrlToReg: - gpr.MapDirtyIn(inst.dest, IRREG_VPFU_CTRL_BASE + inst.src1); + gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1); goto doDefault; case IROp::Syscall: diff --git a/Core/MIPS/MIPS.h b/Core/MIPS/MIPS.h index bbc9952c4dc1..d3a01f1bde31 100644 --- a/Core/MIPS/MIPS.h +++ b/Core/MIPS/MIPS.h @@ -86,6 +86,7 @@ enum MIPSGPReg { MIPS_REG_RA=31, // Not real regs, just for convenience/jit mapping. + // NOTE: These are not the same as the offsets the IR has to use! MIPS_REG_HI = 32, MIPS_REG_LO = 33, MIPS_REG_FPCOND = 34, @@ -155,7 +156,7 @@ class MIPSState void DoState(PointerWrap &p); - // MUST start with r and be followed by f! + // MUST start with r and be followed by f, v, and t! u32 r[32]; union { float f[32]; @@ -166,23 +167,25 @@ class MIPSState float v[128]; u32 vi[128]; }; - // Used for temporary variables by IR Interpreter. + + // Register-allocated JIT Temps don't get flushed so we don't reserve space for them. 
+ // However, the IR interpreter needs some temps that can stick around between ops. // Can be indexed through r[] using indices 192+. - u32 t[16]; + u32 t[16]; //192 + // float vt[16]; //208 TODO: VFPU temp - // Temps don't get flushed so we don't reserve space for them. // If vfpuCtrl (prefixes) get mysterious values, check the VFPU regcache code. - u32 vfpuCtrl[16]; + u32 vfpuCtrl[16]; // 208 // ARM64 wants lo/hi to be aligned to 64 bits from the base of this struct. - u32 padLoHi; + u32 padLoHi; // 224 union { struct { - u32 pc; + u32 pc; //225 - u32 lo; // offset 192 + 16 + 16 + 1 + 1 - u32 hi; + u32 lo; //226 + u32 hi; //227 u32 fcr31; //fpu control register u32 fpcond; // cache the cond flag of fcr31 (& 1 << 23) From 45efcda6b1cd16f167e9310b78b91dd487e682e8 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Tue, 10 May 2016 21:50:08 +0200 Subject: [PATCH 45/77] IR: Some more VFPU --- Core/MIPS/IR/IRCompVFPU.cpp | 84 ++++++++++++++++++++++++++++++--- Core/MIPS/IR/IRInst.cpp | 21 +++++++++ Core/MIPS/IR/IRInst.h | 21 +++++++++ Core/MIPS/IR/IRInterpreter.cpp | 38 +++++++++++++++ Core/MIPS/IR/IRJit.cpp | 6 +-- Core/MIPS/IR/IRPassSimplify.cpp | 20 +++++++- Core/MIPS/MIPSVFPUUtils.h | 14 +++++- Core/MIPS/x86/CompVFPU.cpp | 5 +- 8 files changed, 195 insertions(+), 14 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 2bb96e754942..e67b93cdddbd 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -16,6 +16,7 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include + #include "math/math_util.h" #include "Core/MemMap.h" @@ -57,7 +58,9 @@ namespace MIPSComp { } static bool IsConsecutive4(const u8 regs[4]) { - return (regs[1] == regs[0] + 1 && regs[2] == regs[1] + 1 && regs[3] == regs[2] + 1); + return regs[1] == regs[0] + 1 && + regs[2] == regs[1] + 1 && + regs[3] == regs[2] + 1; } void IRFrontend::Comp_VPFX(MIPSOpcode op) { @@ -244,15 +247,79 @@ namespace MIPSComp { } void IRFrontend::Comp_VVectorInit(MIPSOpcode op) { - DISABLE; + if (!js.HasNoPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + int type = (op >> 16) & 0xF; + int vd = _VD; + + if (sz == 4 && IsVectorColumn(vd)) { + u8 dregs[4]; + GetVectorRegs(dregs, sz, vd); + ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); + } else if (sz == 1) { + ir.Write(IROp::SetConstV, voffset[vd], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); + } else { + DISABLE; + } } void IRFrontend::Comp_VIdt(MIPSOpcode op) { - DISABLE; + if (!js.HasNoPrefix()) + DISABLE; + + int vd = _VD; + VectorSize sz = GetVecSize(op); + if (sz != V_Quad) + DISABLE; + + if (!IsVectorColumn(vd)) + DISABLE; + + u8 dregs[4]; + GetVectorRegs(dregs, sz, vd); + int row = vd & 3; + Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); + ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)init); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { - DISABLE; + MatrixSize sz = GetMtxSize(op); + if (sz != M_4x4) { + DISABLE; + } + + // Not really about trying here, it will work if enabled. + VectorSize vsz = GetVectorSize(sz); + u8 vecs[4]; + int vd = _VD; + if (IsMatrixTransposed(vd)) { + // All outputs are transpositionally symmetric, so should be fine. + vd = TransposeMatrixReg(vd); + } + GetMatrixColumns(vd, M_4x4, vecs); + for (int i = 0; i < 4; i++) { + u8 vec[4]; + GetVectorRegs(vec, vsz, vecs[i]); + // As they are columns, they will be nicely consecutive. 
+ Vec4Init init; + switch ((op >> 16) & 0xF) { + case 3: + init = Vec4Init((int)Vec4Init::Set_1000 + i); + break; + case 6: + init = Vec4Init::AllZERO; + break; + case 7: + init = Vec4Init::AllONE; + break; + default: + return; + } + ir.Write(IROp::InitVec4, voffset[vec[0]], (int)init); + } + return; } void IRFrontend::Comp_VHdp(MIPSOpcode op) { @@ -275,7 +342,7 @@ namespace MIPSComp { void IRFrontend::Comp_VV2Op(MIPSOpcode op) { CONDITIONAL_DISABLE; - // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure + // Eliminate silly no-op VMOVs, common in Wipeout Pure if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) { return; } @@ -379,7 +446,12 @@ namespace MIPSComp { } void IRFrontend::Comp_Viim(MIPSOpcode op) { - DISABLE; + if (!js.HasNoPrefix()) + DISABLE; + + u8 dreg = _VT; + s32 imm = (s32)(s16)(u16)(op & 0xFFFF); + ir.Write(IROp::SetConstV, voffset[dreg], ir.AddConstantFloat((float)imm)); } void IRFrontend::Comp_Vfim(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index d82e72ccdb86..e9bc55ab7844 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -85,9 +85,18 @@ static const IRMeta irMeta[] = { { IROp::FMovToGPR, "FMovToGPR", "GF" }, { IROp::VMovFromGPR, "VMovFromGPR", "VG" }, { IROp::VMovToGPR, "VMovToGPR", "GV" }, + { IROp::InitVec4, "InitVec4", "Vv"}, { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, + + { IROp::VSin, "VSin", "VV" }, + { IROp::VCos, "VCos", "VV" }, + { IROp::VSqrt, "VSqrt", "VV" }, + { IROp::VRSqrt, "VRSqrt", "VV" }, + { IROp::VRecip, "VRecip", "VV" }, + { IROp::VAsin, "VAsin", "VV" }, + { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, { IROp::ExitToConst, "Exit", "C" }, @@ -177,6 +186,15 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co "RCX6", "RCX7", }; + static const char *initVec4Names[8] = { + "[0 0 0 0]", + "[1 1 1 1]", + "[-1 -1 -1 -1]", + "[1 0 0 0]", + "[0 1 0 0]", + "[0 0 1 0]", + "[0 0 0 1]", + }; switch (type) { case 'G': @@ -197,6 +215,9 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'T': snprintf(buf, bufSize, "%s", vfpuCtrlNames[param]); break; + case 'v': + snprintf(buf, bufSize, "%s", initVec4Names[param]); + break; case '_': case '\0': buf[0] = 0; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 70f0e0ff6e82..e2c0f6644a33 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -144,6 +144,16 @@ enum class IROp : u8 { VMovFromGPR, VMovToGPR, + InitVec4, + + // Slow special functions. Used on singles. + VSin, + VCos, + VSqrt, + VRSqrt, + VRecip, + VAsin, + // Fake/System instructions Interpret, @@ -181,6 +191,17 @@ enum IRComparison { Bad, }; +// Some common vec4 constants. 
+enum class Vec4Init { + AllZERO, + AllONE, + AllMinusONE, + Set_1000, + Set_0100, + Set_0010, + Set_0001, +}; + // Hm, unused inline IRComparison Invert(IRComparison comp) { switch (comp) { diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 63e0bd533ef4..2a601bb8f8a1 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -6,6 +6,7 @@ #include "Core/HLE/HLE.h" #include "Core/HLE/ReplaceTables.h" #include "Core/MIPS/MIPSTables.h" +#include "Core/MIPS/MIPSVFPUUtils.h" #include "math/math_util.h" #include "Common/CommonTypes.h" @@ -14,6 +15,16 @@ #include "Core/MIPS/IR/IRInst.h" #include "Core/MIPS/IR/IRInterpreter.h" +alignas(16) float vec4InitValues[8][4] = { + { 0.0f, 0.0f, 0.0f, 0.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f }, + { -1.0f, -1.0f, -1.0f, -1.0f }, + { 1.0f, 0.0f, 0.0f, 0.0f }, + { 0.0f, 1.0f, 0.0f, 0.0f }, + { 0.0f, 0.0f, 1.0f, 0.0f }, + { 0.0f, 0.0f, 0.0f, 1.0f }, +}; + u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) { const IRInst *end = inst + count; while (inst != end) { @@ -134,6 +145,33 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } + case IROp::InitVec4: +#if defined(_M_SSE) + _mm_store_ps(&mips->v[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); +#else + memcpy(&mips->v[inst->dest + i], vec4InitValues[inst->src1], 4 * sizeof(float)); +#endif + break; + + case IROp::VSin: + mips->v[inst->dest] = vfpu_sin(mips->v[inst->src1]); + break; + case IROp::VCos: + mips->v[inst->dest] = vfpu_cos(mips->v[inst->src1]); + break; + case IROp::VSqrt: + mips->v[inst->dest] = sqrtf(mips->v[inst->src1]); + break; + case IROp::VRSqrt: + mips->v[inst->dest] = 1.0f / sqrtf(mips->v[inst->src1]); + break; + case IROp::VRecip: + mips->v[inst->dest] = 1.0f / mips->v[inst->src1]; + break; + case IROp::VAsin: + mips->v[inst->dest] = vfpu_asin(mips->v[inst->src1]); + break; + case IROp::ShlImm: mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; break; diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp index 661543a9d0df..fb490268559d 100644 --- a/Core/MIPS/IR/IRJit.cpp +++ b/Core/MIPS/IR/IRJit.cpp @@ -114,9 +114,9 @@ void IRJit::RunLoopUntil(u64 globalticks) { } while (mips_->downcount >= 0) { u32 inst = Memory::ReadUnchecked_U32(mips_->pc); - u32 opcode = inst >> 24; - u32 data = inst & 0xFFFFFF; - if (opcode == (MIPS_EMUHACK_OPCODE >> 24)) { + u32 opcode = inst & 0xFF000000; + if (opcode == MIPS_EMUHACK_OPCODE) { + u32 data = inst & 0xFFFFFF; IRBlock *block = blocks_.GetBlock(data); mips_->pc = IRInterpret(mips_, block->GetInstructions(), block->GetConstants(), block->GetNumInstructions()); } else { diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 50bfca890357..d7c93593f9ab 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -340,8 +340,13 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { // FP-only instructions don't need to flush immediates. 
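The reworked RunLoopUntil() above dispatches compiled blocks through the instruction stream itself: Finalize() overwrites the first word of a block with an emuhack whose top 8 bits are the tag and whose low 24 bits are the block number, and the dispatcher masks those back out. A tiny sketch of that encode/decode; the tag value below is a placeholder for the real MIPS_EMUHACK_OPCODE constant:

#include <cassert>
#include <cstdint>

// Placeholder tag; the real value is MIPS_EMUHACK_OPCODE from the codebase.
static const uint32_t kEmuhackOpcode = 0x68000000;

// What Finalize() conceptually does: overwrite the block's first instruction
// so the dispatcher can find the block again by index.
uint32_t EncodeBlockLink(uint32_t blockNum) {
    assert(blockNum <= 0x00FFFFFF);           // only 24 bits available
    return kEmuhackOpcode | blockNum;
}

// The dispatch test from RunLoopUntil(): top byte selects, low 24 bits index.
bool DecodeBlockLink(uint32_t inst, uint32_t *blockNum) {
    if ((inst & 0xFF000000) != kEmuhackOpcode)
        return false;                         // a normal MIPS instruction, interpret/compile it
    *blockNum = inst & 0x00FFFFFF;
    return true;
}

int main() {
    uint32_t n = 0;
    uint32_t word = EncodeBlockLink(1234);
    return (DecodeBlockLink(word, &n) && n == 1234) ? 0 : 1;
}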
case IROp::FAdd: case IROp::FMul: - case IROp::FDiv: + // Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1) + if (inst.src2 == inst.dest && inst.src1 != inst.src2) + std::swap(inst.src1, inst.src2); + out.Write(inst); + break; case IROp::FSub: + case IROp::FDiv: case IROp::FNeg: case IROp::FAbs: case IROp::FSqrt: @@ -373,6 +378,19 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; + case IROp::InitVec4: + out.Write(inst); + break; + + case IROp::VSin: + case IROp::VCos: + case IROp::VSqrt: + case IROp::VRSqrt: + case IROp::VRecip: + case IROp::VAsin: + out.Write(inst); + break; + case IROp::ZeroFpCond: case IROp::FCmpUnordered: case IROp::FCmpEqual: diff --git a/Core/MIPS/MIPSVFPUUtils.h b/Core/MIPS/MIPSVFPUUtils.h index bb8403217f3d..7f6ada0fa2c6 100644 --- a/Core/MIPS/MIPSVFPUUtils.h +++ b/Core/MIPS/MIPSVFPUUtils.h @@ -45,6 +45,10 @@ inline float vfpu_cos(float angle) { return cosf(angle); } +inline float vfpu_asin(float angle) { + return asinf(angle) / M_PI_2; +} + inline void vfpu_sincos(float angle, float &sine, float &cosine) { angle -= floorf(angle * 0.25f) * 4.f; angle *= (float)M_PI_2; @@ -127,7 +131,15 @@ int GetNumVectorElements(VectorSize sz); int GetMatrixSide(MatrixSize sz); const char *GetVectorNotation(int reg, VectorSize size); const char *GetMatrixNotation(int reg, MatrixSize size); - +inline bool IsMatrixTransposed(int matrixReg) { + return (matrixReg >> 5) & 1; +} +inline bool IsVectorColumn(int vectorReg) { + return !((vectorReg >> 5) & 1); +} +inline int TransposeMatrixReg(int matrixReg) { + return matrixReg ^ 0x20; +} int GetVectorOverlap(int reg1, VectorSize size1, int reg2, VectorSize size2); float Float16ToFloat32(unsigned short l); diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index fcc51c364659..749967f53a61 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -101,8 +101,7 @@ void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { for (int i = 0; i < n; i++) origV[i] = vregs[i]; - for (int i = 0; i < n; i++) - { + for (int i = 0; i < n; i++) { int regnum = (prefix >> (i*2)) & 3; int abs = (prefix >> (8+i)) & 1; int negate = (prefix >> (16+i)) & 1; @@ -2142,7 +2141,7 @@ void CosOnly(SinCosArg angle) { } void ASinScaled(SinCosArg angle) { - sincostemp[0] = asinf(angle) / M_PI_2; + sincostemp[0] = vfpu_asin(angle); } void SinCosNegSin(SinCosArg angle) { From db1d1ff9fdfdfaa1bf382060308629eab82aee9d Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Tue, 10 May 2016 22:55:27 +0200 Subject: [PATCH 46/77] IR: Merge the FPU and VFPU instruction sets, no reason to keep them apart --- Core/MIPS/IR/IRCompVFPU.cpp | 38 ++++++++++++------------ Core/MIPS/IR/IRInst.cpp | 32 +++++++++------------ Core/MIPS/IR/IRInst.h | 16 ++++------- Core/MIPS/IR/IRInterpreter.cpp | 51 +++++++++++---------------------- Core/MIPS/IR/IRPassSimplify.cpp | 33 +++++---------------- 5 files changed, 62 insertions(+), 108 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index e67b93cdddbd..8f35cbef86fe 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -50,10 +50,12 @@ #define _IMM16 (signed short)(op & 0xFFFF) #define _IMM26 (op & 0x03FFFFFF) +const int vfpuBase = 32; // skip the FP registers + namespace MIPSComp { static void ApplyVoffset(u8 regs[4], int count) { for (int i = 0; i < count; i++) { - regs[i] = voffset[regs[i]]; + regs[i] = vfpuBase + voffset[regs[i]]; } } @@ -194,11 +196,11 @@ namespace MIPSComp { MIPSGPReg rs = _RS; 
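Patch 46 above collapses the FPU and VFPU into one float index space: operands 0-31 are f[], and vfpuBase (32) plus voffset[] reaches v[], which the MIPSState comment ("MUST start with r and be followed by f, v, and t!") guarantees sits directly after f[]. The IR temps and VFPU control words then live at the fixed indices noted in MIPS.h and the IRREG_* constants (192, 208, 226, ...). A simplified mirror of that layout, only to check the index arithmetic; this is not the real MIPSState and ignores its unions and alignment attributes:

#include <cstddef>
#include <cstdint>

// Simplified stand-in for MIPSState: 4-byte fields only, in the order the IR relies on.
struct MiniMIPSState {
    uint32_t r[32];         // GPRs: IR indices 0-31
    float    f[32];         // FPU regs: float indices 0-31
    float    v[128];        // VFPU regs, reached as f[vfpuBase + voffset[n]] with vfpuBase == 32
    uint32_t t[16];         // IR temps, indexed through r[] at 192+
    uint32_t vfpuCtrl[16];  // IRREG_VFPU_CTRL_BASE == 208
    uint32_t padLoHi;       // 224
    uint32_t pc;            // 225
    uint32_t lo;            // IRREG_LO == 226
    uint32_t hi;            // IRREG_HI == 227
    uint32_t fcr31;         // IRREG_FCR31 == 228
    uint32_t fpcond;        // IRREG_FPCOND == 229
};

static_assert((offsetof(MiniMIPSState, v) - offsetof(MiniMIPSState, f)) / 4 == 32,
              "vfpuBase: v[] starts 32 floats after f[]");
static_assert(offsetof(MiniMIPSState, t) / 4 == 192, "temps at index 192");
static_assert(offsetof(MiniMIPSState, vfpuCtrl) / 4 == 208, "VFPU ctrl base at 208");
static_assert(offsetof(MiniMIPSState, lo) / 4 == 226, "lo at 226");
static_assert(offsetof(MiniMIPSState, hi) / 4 == 227, "hi at 227");

int main() { return 0; }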
switch (op >> 26) { case 50: //lv.s - ir.Write(IROp::LoadFloatV, voffset[vt], rs, ir.AddConstant(offset)); + ir.Write(IROp::LoadFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset)); break; case 58: //sv.s - ir.Write(IROp::StoreFloatV, voffset[vt], rs, ir.AddConstant(offset)); + ir.Write(IROp::StoreFloat, vfpuBase + voffset[vt], rs, ir.AddConstant(offset)); break; default: @@ -221,10 +223,10 @@ namespace MIPSComp { ir.Write(IROp::LoadVec4, vregs[0], rs, ir.AddConstant(imm)); } else { // Let's not even bother with "vertical" loads for now. - ir.Write(IROp::LoadFloatV, vregs[0], rs, ir.AddConstant(imm)); - ir.Write(IROp::LoadFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::LoadFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::LoadFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + ir.Write(IROp::LoadFloat, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::LoadFloat, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::LoadFloat, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::LoadFloat, vregs[3], rs, ir.AddConstant(imm + 12)); } break; @@ -233,10 +235,10 @@ namespace MIPSComp { ir.Write(IROp::StoreVec4, vregs[0], rs, ir.AddConstant(imm)); } else { // Let's not even bother with "vertical" stores for now. - ir.Write(IROp::StoreFloatV, vregs[0], rs, ir.AddConstant(imm)); - ir.Write(IROp::StoreFloatV, vregs[1], rs, ir.AddConstant(imm + 4)); - ir.Write(IROp::StoreFloatV, vregs[2], rs, ir.AddConstant(imm + 8)); - ir.Write(IROp::StoreFloatV, vregs[3], rs, ir.AddConstant(imm + 12)); + ir.Write(IROp::StoreFloat, vregs[0], rs, ir.AddConstant(imm)); + ir.Write(IROp::StoreFloat, vregs[1], rs, ir.AddConstant(imm + 4)); + ir.Write(IROp::StoreFloat, vregs[2], rs, ir.AddConstant(imm + 8)); + ir.Write(IROp::StoreFloat, vregs[3], rs, ir.AddConstant(imm + 12)); } break; @@ -257,9 +259,9 @@ namespace MIPSComp { if (sz == 4 && IsVectorColumn(vd)) { u8 dregs[4]; GetVectorRegs(dregs, sz, vd); - ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); + ir.Write(IROp::InitVec4, vfpuBase + voffset[dregs[0]], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); } else if (sz == 1) { - ir.Write(IROp::SetConstV, voffset[vd], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); + ir.Write(IROp::SetConstF, vfpuBase + voffset[vd], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); } else { DISABLE; } @@ -281,7 +283,7 @@ namespace MIPSComp { GetVectorRegs(dregs, sz, vd); int row = vd & 3; Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); - ir.Write(IROp::InitVec4, voffset[dregs[0]], (int)init); + ir.Write(IROp::InitVec4, vfpuBase + voffset[dregs[0]], (int)init); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { @@ -317,7 +319,7 @@ namespace MIPSComp { default: return; } - ir.Write(IROp::InitVec4, voffset[vec[0]], (int)init); + ir.Write(IROp::InitVec4, vfpuBase + voffset[vec[0]], (int)init); } return; } @@ -369,7 +371,7 @@ namespace MIPSComp { // rt = 0, imm = 255 appears to be used as a CPU interlock by some games. 
if (rt != 0) { if (imm < 128) { //R(rt) = VI(imm); - ir.Write(IROp::VMovToGPR, rt, voffset[imm]); + ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]); } else { DISABLE; } @@ -378,7 +380,7 @@ namespace MIPSComp { case 7: // mtv if (imm < 128) { - ir.Write(IROp::VMovFromGPR, voffset[imm], rt); + ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[imm], rt); } else { DISABLE; } @@ -451,7 +453,7 @@ namespace MIPSComp { u8 dreg = _VT; s32 imm = (s32)(s16)(u16)(op & 0xFFFF); - ir.Write(IROp::SetConstV, voffset[dreg], ir.AddConstantFloat((float)imm)); + ir.Write(IROp::SetConstF, vfpuBase + voffset[dreg], ir.AddConstantFloat((float)imm)); } void IRFrontend::Comp_Vfim(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index e9bc55ab7844..1f80be40b217 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -6,7 +6,6 @@ static const IRMeta irMeta[] = { { IROp::SetConst, "SetConst", "GC" }, { IROp::SetConstF, "SetConstF", "FC" }, - { IROp::SetConstV, "SetConstV", "VC" }, { IROp::Mov, "Mov", "GG" }, { IROp::Add, "Add", "GGG" }, { IROp::Sub, "Sub", "GGG" }, @@ -59,14 +58,12 @@ static const IRMeta irMeta[] = { { IROp::Load16Ext, "Load16Ext", "GGC" }, { IROp::Load32, "Load32", "GGC" }, { IROp::LoadFloat, "LoadFloat", "FGC" }, - { IROp::LoadFloatV, "LoadFloatV", "VGC" }, - { IROp::LoadVec4, "LoadVec4", "VGC" }, + { IROp::LoadVec4, "LoadVec4", "FGC" }, { IROp::Store8, "Store8", "GGC" }, { IROp::Store16, "Store16", "GGC" }, { IROp::Store32, "Store32", "GGC" }, { IROp::StoreFloat, "StoreFloat", "FGC" }, - { IROp::StoreFloatV, "StoreFloatV", "VGC" }, - { IROp::StoreVec4, "StoreVec4", "VGC" }, + { IROp::StoreVec4, "StoreVec4", "FGC" }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, @@ -83,19 +80,17 @@ static const IRMeta irMeta[] = { { IROp::FCvtSW, "FCvtSW", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, - { IROp::VMovFromGPR, "VMovFromGPR", "VG" }, - { IROp::VMovToGPR, "VMovToGPR", "GV" }, - { IROp::InitVec4, "InitVec4", "Vv"}, + { IROp::InitVec4, "InitVec4", "Fv"}, { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, - { IROp::VSin, "VSin", "VV" }, - { IROp::VCos, "VCos", "VV" }, - { IROp::VSqrt, "VSqrt", "VV" }, - { IROp::VRSqrt, "VRSqrt", "VV" }, - { IROp::VRecip, "VRecip", "VV" }, - { IROp::VAsin, "VAsin", "VV" }, + { IROp::FSin, "FSin", "FF" }, + { IROp::FCos, "FCos", "FF" }, + { IROp::FSqrt, "FSqrt", "FF" }, + { IROp::FRSqrt, "FRSqrt", "FF" }, + { IROp::FRecip, "FRecip", "FF" }, + { IROp::FAsin, "FAsin", "FF" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, @@ -201,7 +196,11 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co snprintf(buf, bufSize, "%s", GetGPRName(param)); break; case 'F': - snprintf(buf, bufSize, "f%d", param); + if (param >= 32) { + snprintf(buf, bufSize, "v%d", param - 32); + } else { + snprintf(buf, bufSize, "f%d", param); + } break; case 'C': snprintf(buf, bufSize, "%08x", constPool[param]); @@ -209,9 +208,6 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'I': snprintf(buf, bufSize, "%02x", param); break; - case 'V': - snprintf(buf, bufSize, "v%d", param); - break; case 'T': snprintf(buf, bufSize, "%s", vfpuCtrlNames[param]); break; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index e2c0f6644a33..1c033b0ce376 100644 --- a/Core/MIPS/IR/IRInst.h +++ 
b/Core/MIPS/IR/IRInst.h @@ -18,7 +18,6 @@ enum class IROp : u8 { SetConst, SetConstF, - SetConstV, Mov, @@ -89,14 +88,12 @@ enum class IROp : u8 { Load16Ext, Load32, LoadFloat, - LoadFloatV, LoadVec4, Store8, Store16, Store32, StoreFloat, - StoreFloatV, StoreVec4, Ext8to32, @@ -141,18 +138,15 @@ enum class IROp : u8 { UpdateRoundingMode, SetCtrlVFPU, - VMovFromGPR, - VMovToGPR, InitVec4, // Slow special functions. Used on singles. - VSin, - VCos, - VSqrt, - VRSqrt, - VRecip, - VAsin, + FSin, + FCos, + FRSqrt, + FRecip, + FAsin, // Fake/System instructions Interpret, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 2a601bb8f8a1..6c71682c18a2 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -35,9 +35,6 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::SetConstF: memcpy(&mips->f[inst->dest], &constPool[inst->src1], 4); break; - case IROp::SetConstV: - memcpy(&mips->v[inst->dest], &constPool[inst->src1], 4); - break; case IROp::Add: mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; break; @@ -102,9 +99,6 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::LoadFloat: mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); break; - case IROp::LoadFloatV: - mips->v[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); - break; case IROp::Store8: Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); @@ -118,18 +112,15 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::StoreFloat: Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); break; - case IROp::StoreFloatV: - Memory::WriteUnchecked_Float(mips->v[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); - break; case IROp::LoadVec4: { u32 base = mips->r[inst->src1] + constPool[inst->src2]; #if defined(_M_SSE) - _mm_store_ps(&mips->v[inst->dest], _mm_load_ps((const float *)Memory::GetPointerUnchecked(base))); + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps((const float *)Memory::GetPointerUnchecked(base))); #else for (int i = 0; i < 4; i++) - mips->v[inst->dest + i] = Memory::ReadUnchecked_Float(base + 4 * i); + mips->f[inst->dest + i] = Memory::ReadUnchecked_Float(base + 4 * i); #endif break; } @@ -137,39 +128,36 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c { u32 base = mips->r[inst->src1] + constPool[inst->src2]; #if defined(_M_SSE) - _mm_store_ps((float *)Memory::GetPointerUnchecked(base), _mm_load_ps(&mips->v[inst->dest])); + _mm_store_ps((float *)Memory::GetPointerUnchecked(base), _mm_load_ps(&mips->f[inst->dest])); #else for (int i = 0; i < 4; i++) - Memory::WriteUnchecked_Float(mips->v[inst->dest + i], base + 4 * i); + Memory::WriteUnchecked_Float(mips->f[inst->dest + i], base + 4 * i); #endif break; } case IROp::InitVec4: #if defined(_M_SSE) - _mm_store_ps(&mips->v[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); #else - memcpy(&mips->v[inst->dest + i], vec4InitValues[inst->src1], 4 * sizeof(float)); + memcpy(&mips->f[inst->dest + i], vec4InitValues[inst->src1], 4 * sizeof(float)); #endif break; - case IROp::VSin: - mips->v[inst->dest] = vfpu_sin(mips->v[inst->src1]); - break; - case IROp::VCos: - mips->v[inst->dest] = 
vfpu_cos(mips->v[inst->src1]); + case IROp::FSin: + mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]); break; - case IROp::VSqrt: - mips->v[inst->dest] = sqrtf(mips->v[inst->src1]); + case IROp::FCos: + mips->f[inst->dest] = vfpu_cos(mips->f[inst->src1]); break; - case IROp::VRSqrt: - mips->v[inst->dest] = 1.0f / sqrtf(mips->v[inst->src1]); + case IROp::FRSqrt: + mips->f[inst->dest] = 1.0f / sqrtf(mips->f[inst->src1]); break; - case IROp::VRecip: - mips->v[inst->dest] = 1.0f / mips->v[inst->src1]; + case IROp::FRecip: + mips->f[inst->dest] = 1.0f / mips->f[inst->src1]; break; - case IROp::VAsin: - mips->v[inst->dest] = vfpu_asin(mips->v[inst->src1]); + case IROp::FAsin: + mips->f[inst->dest] = vfpu_asin(mips->f[inst->src1]); break; case IROp::ShlImm: @@ -378,13 +366,6 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); break; - case IROp::VMovFromGPR: - memcpy(&mips->v[inst->dest], &mips->r[inst->src1], 4); - break; - case IROp::VMovToGPR: - memcpy(&mips->r[inst->dest], &mips->v[inst->src1], 4); - break; - case IROp::ExitToConst: return constPool[inst->dest]; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index d7c93593f9ab..e846c8420a1c 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -110,7 +110,6 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { gpr.SetImm(inst.dest, constants[inst.src1]); break; case IROp::SetConstF: - case IROp::SetConstV: goto doDefault; case IROp::Sub: @@ -251,23 +250,10 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; - case IROp::VMovFromGPR: - if (gpr.IsImm(inst.src1)) { - out.Write(IROp::SetConstV, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); - } else { - gpr.MapIn(inst.src1); - goto doDefault; - } - break; - case IROp::FMovToGPR: gpr.MapDirty(inst.dest); goto doDefault; - case IROp::VMovToGPR: - gpr.MapDirty(inst.dest); - goto doDefault; - case IROp::MfHi: case IROp::MfLo: gpr.MapDirty(inst.dest); @@ -290,7 +276,6 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; case IROp::StoreFloat: - case IROp::StoreFloatV: case IROp::StoreVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); @@ -314,7 +299,6 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; case IROp::LoadFloat: - case IROp::LoadFloatV: case IROp::LoadVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + constants[inst.src2])); @@ -345,17 +329,23 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { std::swap(inst.src1, inst.src2); out.Write(inst); break; + case IROp::FSub: case IROp::FDiv: case IROp::FNeg: case IROp::FAbs: - case IROp::FSqrt: case IROp::FMov: case IROp::FRound: case IROp::FTrunc: case IROp::FCeil: case IROp::FFloor: case IROp::FCvtSW: + case IROp::FSin: + case IROp::FCos: + case IROp::FSqrt: + case IROp::FRSqrt: + case IROp::FRecip: + case IROp::FAsin: out.Write(inst); break; @@ -382,15 +372,6 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { out.Write(inst); break; - case IROp::VSin: - case IROp::VCos: - case IROp::VSqrt: - case IROp::VRSqrt: - case IROp::VRecip: - case IROp::VAsin: - out.Write(inst); - break; - case IROp::ZeroFpCond: case IROp::FCmpUnordered: case IROp::FCmpEqual: From b3dd36982f7a92596705f777e5bb42ef7c5eeb57 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Tue, 10 May 
2016 23:14:26 +0200 Subject: [PATCH 47/77] Prefix prep --- Core/MIPS/IR/IRCompVFPU.cpp | 26 +++++++++----------------- Core/MIPS/IR/IRInst.cpp | 2 ++ Core/MIPS/IR/IRInst.h | 3 +++ Core/MIPS/IR/IRInterpreter.cpp | 7 +++++++ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 8f35cbef86fe..2346473786e8 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -152,17 +152,21 @@ namespace MIPSComp { int n = GetNumVectorElements(sz); for (int i = 0; i < n; i++) { // Hopefully this is rare, we'll just write it into a reg we drop. - //if (js.VfpuWriteMask(i)) - // regs[i] = fpr.GetTempV(); + if (js.VfpuWriteMask(i)) + regs[i] = fpr.GetTempV(); } } + inline int GetDSat(int prefix, int i) { + return (prefix >> (i * 2)) & 3; + } + + // "D" prefix is really a post process. No need to allocate a temporary register. void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); if (!js.prefixD) return; - /* int n = GetNumVectorElements(sz); for (int i = 0; i < n; i++) { if (js.VfpuWriteMask(i)) @@ -171,23 +175,11 @@ namespace MIPSComp { int sat = (js.prefixD >> (i * 2)) & 3; if (sat == 1) { // clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1] - fpr.MapRegV(vregs[i], MAP_DIRTY); - - fp.MOVI2F(S0, 0.0f, SCRATCH1); - fp.MOVI2F(S1, 1.0f, SCRATCH1); - fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1); - fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); + ir.Write(IROp::FSat0_1, vfpuBase + voffset[vregs[i]], vfpuBase + voffset[vregs[i]]); } else if (sat == 3) { - // clamped = x < -1 ? (x > 1 ? 1 : x) : x [-1, 1] - fpr.MapRegV(vregs[i], MAP_DIRTY); - - fp.MOVI2F(S0, -1.0f, SCRATCH1); - fp.MOVI2F(S1, 1.0f, SCRATCH1); - fp.FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), S1); - fp.FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); + ir.Write(IROp::FSatMinus1_1, vfpuBase + voffset[vregs[i]], vfpuBase + voffset[vregs[i]]); } } - */ } void IRFrontend::Comp_SV(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 1f80be40b217..469c97cf5818 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -78,6 +78,8 @@ static const IRMeta irMeta[] = { { IROp::FFloor, "FFloor", "FF" }, { IROp::FCvtWS, "FCvtWS", "FF" }, { IROp::FCvtSW, "FCvtSW", "FF" }, + { IROp::FSat0_1, "FSat(0 - 1)", "FF" }, + { IROp::FSatMinus1_1, "FSat(-1 - 1)", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, { IROp::InitVec4, "InitVec4", "Fv"}, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 1c033b0ce376..322ef2386d3d 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -120,6 +120,9 @@ enum class IROp : u8 { FMovFromGPR, FMovToGPR, + FSat0_1, + FSatMinus1_1, + FpCondToReg, VfpuCtrlToReg, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 6c71682c18a2..f77216ef0c37 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -303,6 +303,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FNeg: mips->f[inst->dest] = -mips->f[inst->src1]; break; + case IROp::FSat0_1: + mips->f[inst->dest] = clamp_value(mips->f[inst->src1], 0.0f, 1.0f); + break; + case IROp::FSatMinus1_1: + mips->f[inst->dest] = clamp_value(mips->f[inst->src1], -1.0f, 1.0f); + break; + case IROp::FpCondToReg: mips->r[inst->dest] = mips->fpcond; break; From 219548b8e28627cebcb14f550a1f7287643a5796 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Wed, 11 
May 2016 00:16:07 +0200 Subject: [PATCH 48/77] Prefix prep --- Core/MIPS/IR/IRCompVFPU.cpp | 78 +++++++++++++++++++++------------- Core/MIPS/IR/IRFrontend.h | 16 +++---- Core/MIPS/IR/IRInst.cpp | 19 ++++++--- Core/MIPS/IR/IRInst.h | 17 ++++++-- Core/MIPS/IR/IRInterpreter.cpp | 9 ++++ Core/MIPS/MIPS.h | 15 ++++--- 6 files changed, 96 insertions(+), 58 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 2346473786e8..c9d0083d2eb3 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -88,7 +88,7 @@ namespace MIPSComp { } } - void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { + void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) { if (prefix == 0xE4) return; @@ -109,13 +109,9 @@ namespace MIPSComp { if (!constants && regnum == i && !abs && !negate) continue; - /* // This puts the value into a temp reg, so we won't write the modified value back. - vregs[i] = fpr.GetTempV(); + vregs[i] = tempReg + i; if (!constants) { - fpr.MapDirtyInV(vregs[i], origV[regnum]); - fpr.SpillLockV(vregs[i]); - // Prefix may say "z, z, z, z" but if this is a pair, we force to x. // TODO: But some ops seem to use const 0 instead? if (regnum >= n) { @@ -124,36 +120,58 @@ namespace MIPSComp { } if (abs) { - fp.FABS(fpr.V(vregs[i]), fpr.V(origV[regnum])); + ir.Write(IROp::FAbs, vregs[i], origV[regnum]); if (negate) - fp.FNEG(fpr.V(vregs[i]), fpr.V(vregs[i])); + ir.Write(IROp::FNeg, vregs[i], vregs[i]); } else { if (negate) - fp.FNEG(fpr.V(vregs[i]), fpr.V(origV[regnum])); + ir.Write(IROp::FNeg, vregs[i], origV[regnum]); else - fp.FMOV(fpr.V(vregs[i]), fpr.V(origV[regnum])); + ir.Write(IROp::FMov, vregs[i], origV[regnum]); } } else { - fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT); - fpr.SpillLockV(vregs[i]); - fp.MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs << 2)], SCRATCH1, (bool)negate); + if (negate) { + ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(-constantArray[regnum + (abs << 2)])); + } else { + ir.Write(IROp::SetConstF, vregs[i], ir.AddConstantFloat(constantArray[regnum + (abs << 2)])); + } } - */ } } + void IRFrontend::GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) { + ::GetVectorRegs(regs, N, vectorReg); + ApplyVoffset(regs, N); + } + + void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) { + ::GetMatrixRegs(regs, N, matrixReg); + // TODO + } + + void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); + ::GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S); + } + void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); + ::GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T); + } + void IRFrontend::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); GetVectorRegs(regs, sz, vectorReg); + int n = GetNumVectorElements(sz); if (js.prefixD == 0) return; - int n = GetNumVectorElements(sz); for (int i = 0; i < n; i++) { - // Hopefully this is rare, we'll just write it into a reg we drop. + // Hopefully this is rare, we'll just write it into a dumping ground reg. 
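Taken together with GetDSat() and the clamp_value() interpreter cases added in the previous patch, the D prefix boils down to two effects: write-masked lanes are redirected to scratch registers (IRVTEMP_PFX_D, used just below), and the per-lane saturation bits clamp the written value. Below is a scalar model of the saturation part only; NaN behaviour of the real hardware is not modeled.

#include <algorithm>
#include <cstdio>

// Scalar model of the destination-prefix saturation emitted as FSat0_1 / FSatMinus1_1.
// The two-bit field per lane is extracted exactly like GetDSat(); modes 0 and 2 are left
// unclamped in this sketch.
static float ApplyDSatModel(float x, int dprefix, int lane) {
    int sat = (dprefix >> (lane * 2)) & 3;
    if (sat == 1)
        return std::min(std::max(x, 0.0f), 1.0f);   // [0, 1]
    if (sat == 3)
        return std::min(std::max(x, -1.0f), 1.0f);  // [-1, 1]
    return x;
}

int main() {
    printf("%g %g\n", ApplyDSatModel(1.5f, 0x1, 0), ApplyDSatModel(-2.0f, 0x3, 0));  // 1 -1
    return 0;
}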
if (js.VfpuWriteMask(i)) - regs[i] = fpr.GetTempV(); + regs[i] = IRVTEMP_PFX_D + i; } } @@ -171,13 +189,12 @@ namespace MIPSComp { for (int i = 0; i < n; i++) { if (js.VfpuWriteMask(i)) continue; - - int sat = (js.prefixD >> (i * 2)) & 3; + int sat = GetDSat(js.prefixD, i); if (sat == 1) { // clamped = x < 0 ? (x > 1 ? 1 : x) : x [0, 1] - ir.Write(IROp::FSat0_1, vfpuBase + voffset[vregs[i]], vfpuBase + voffset[vregs[i]]); + ir.Write(IROp::FSat0_1, vregs[i], vregs[i]); } else if (sat == 3) { - ir.Write(IROp::FSatMinus1_1, vfpuBase + voffset[vregs[i]], vfpuBase + voffset[vregs[i]]); + ir.Write(IROp::FSatMinus1_1, vregs[i], vregs[i]); } } } @@ -207,7 +224,6 @@ namespace MIPSComp { u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); - ApplyVoffset(vregs, 4); // Translate to memory order switch (op >> 26) { case 54: //lv.q @@ -251,9 +267,11 @@ namespace MIPSComp { if (sz == 4 && IsVectorColumn(vd)) { u8 dregs[4]; GetVectorRegs(dregs, sz, vd); - ir.Write(IROp::InitVec4, vfpuBase + voffset[dregs[0]], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); + ir.Write(IROp::InitVec4, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); } else if (sz == 1) { - ir.Write(IROp::SetConstF, vfpuBase + voffset[vd], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); + u8 dreg; + GetVectorRegs(&dreg, V_Single, vd); + ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); } else { DISABLE; } @@ -275,7 +293,7 @@ namespace MIPSComp { GetVectorRegs(dregs, sz, vd); int row = vd & 3; Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); - ir.Write(IROp::InitVec4, vfpuBase + voffset[dregs[0]], (int)init); + ir.Write(IROp::InitVec4, dregs[0], (int)init); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { @@ -311,7 +329,7 @@ namespace MIPSComp { default: return; } - ir.Write(IROp::InitVec4, vfpuBase + voffset[vec[0]], (int)init); + ir.Write(IROp::InitVec4, vec[0], (int)init); } return; } @@ -440,12 +458,14 @@ namespace MIPSComp { } void IRFrontend::Comp_Viim(MIPSOpcode op) { - if (!js.HasNoPrefix()) + if (!js.HasUnknownPrefix()) DISABLE; - u8 dreg = _VT; s32 imm = (s32)(s16)(u16)(op & 0xFFFF); - ir.Write(IROp::SetConstF, vfpuBase + voffset[dreg], ir.AddConstantFloat((float)imm)); + u8 dreg; + GetVectorRegsPrefixD(&dreg, V_Single, _VT); + ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm)); + ApplyPrefixD(&dreg, V_Single); } void IRFrontend::Comp_Vfim(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h index 9b8db0c76a04..7e813a307d81 100644 --- a/Core/MIPS/IR/IRFrontend.h +++ b/Core/MIPS/IR/IRFrontend.h @@ -115,19 +115,13 @@ class IRFrontend : public MIPSFrontendInterface { void CompShiftImm(MIPSOpcode op, IROp shiftType, int sa); void CompShiftVar(MIPSOpcode op, IROp shiftType, IROp shiftTypeConst); - void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz); + void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg); void ApplyPrefixD(const u8 *vregs, VectorSize sz); - void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { - _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); - GetVectorRegs(regs, sz, vectorReg); - ApplyPrefixST(regs, js.prefixS, sz); - } - void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { - _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); - GetVectorRegs(regs, sz, vectorReg); - ApplyPrefixST(regs, js.prefixT, sz); - } + void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg); + void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg); void 
GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); + void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg); + void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg); // Utils void Comp_ITypeMemLR(MIPSOpcode op, bool load); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 469c97cf5818..9d41d74fbfdd 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -70,6 +70,12 @@ static const IRMeta irMeta[] = { { IROp::FDiv, "FDiv", "FFF" }, { IROp::FMov, "FMov", "FF" }, { IROp::FSqrt, "FSqrt", "FF" }, + { IROp::FSin, "FSin", "FF" }, + { IROp::FCos, "FCos", "FF" }, + { IROp::FSqrt, "FSqrt", "FF" }, + { IROp::FRSqrt, "FRSqrt", "FF" }, + { IROp::FRecip, "FRecip", "FF" }, + { IROp::FAsin, "FAsin", "FF" }, { IROp::FNeg, "FNeg", "FF" }, { IROp::FAbs, "FAbs", "FF" }, { IROp::FRound, "FRound", "FF" }, @@ -82,17 +88,12 @@ static const IRMeta irMeta[] = { { IROp::FSatMinus1_1, "FSat(-1 - 1)", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, { IROp::FMovToGPR, "FMovToGPR", "GF" }, - { IROp::InitVec4, "InitVec4", "Fv"}, { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, - { IROp::FSin, "FSin", "FF" }, - { IROp::FCos, "FCos", "FF" }, - { IROp::FSqrt, "FSqrt", "FF" }, - { IROp::FRSqrt, "FRSqrt", "FF" }, - { IROp::FRecip, "FRecip", "FF" }, - { IROp::FAsin, "FAsin", "FF" }, + { IROp::InitVec4, "InitVec4", "Fv" }, + { IROp::ShuffleVec4, "ShuffleVec4", "FFs" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, @@ -192,6 +193,7 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co "[0 0 1 0]", "[0 0 0 1]", }; + static const char *xyzw = "xyzw"; switch (type) { case 'G': @@ -216,6 +218,9 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'v': snprintf(buf, bufSize, "%s", initVec4Names[param]); break; + case 's': + snprintf(buf, bufSize, "%s%s%s%s", xyzw[param & 3], xyzw[(param >> 2) & 3], xyzw[(param >> 4) & 3], xyzw[(param >> 6) & 3]); + break; case '_': case '\0': buf[0] = 0; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 322ef2386d3d..e9bbc4acbf3b 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -142,7 +142,11 @@ enum class IROp : u8 { SetCtrlVFPU, + // 4-wide instructions to assist SIMD. + // Can of course add a pass to break them up if a target does not + // support SIMD. InitVec4, + ShuffleVec4, // Slow special functions. Used on singles. FSin, @@ -232,16 +236,21 @@ enum { IRTEMP_LHS, // Reserved for use in branches IRTEMP_RHS, // Reserved for use in branches + IRVTEMP_PFX_S = 224 - 32, // Relative to the FP regs + IRVTEMP_PFX_T = 228 - 32, + IRVTEMP_PFX_D = 232 - 32, + IRVTEMP_0 = 236 - 32, + // 16 float temps for vector S and T prefixes and things like that. 
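The IRVTEMP_* temporaries above, and the renumbered IRREG_* values that follow, lean on the MIPSState word offsets documented in the MIPS.h hunk later in this patch (f[] starting at word 32, vt[] at 224, lo at 242, and so on). The mock struct below mirrors those commented offsets only, to make the "224 - 32" arithmetic concrete; field names and sizes are taken from the comments, not from the real header.

#include <cstddef>
#include <cstdint>

// Mirrors the commented word offsets only; the real MIPSState has more going on
// (a union around pc/lo/hi, ARM64 alignment requirements for lo/hi, etc.).
struct MockMIPSState {
    uint32_t r[32];         // word 0
    float    f[32];         // word 32   F-typed IR operands index from here
    float    v[128];        // word 64
    uint32_t t[16];         // word 192  reachable through r[] as indices 192+ per the comment
    uint32_t vfpuCtrl[16];  // word 208
    float    vt[16];        // word 224  the new VFPU temp block
    uint32_t padLoHi;       // word 240
    uint32_t pc;            // word 241
    uint32_t lo;            // word 242
    uint32_t hi;            // word 243
    uint32_t fcr31;         // word 244
    uint32_t fpcond;        // word 245
};

static_assert(offsetof(MockMIPSState, vt) / 4 == 224, "vt[] starts at word 224");
static_assert((offsetof(MockMIPSState, vt) - offsetof(MockMIPSState, f)) / 4 == 224 - 32,
              "IRVTEMP_PFX_S = 224 - 32 is vt[0] expressed relative to f[]");
static_assert(offsetof(MockMIPSState, lo) / 4 == 242, "matches IRREG_LO = 242");

int main() { return 0; }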
// IRVTEMP_0 = 208 - 64, // -64 to be relative to v[0] // Hacky way to get to other state IRREG_VFPU_CTRL_BASE = 208, IRREG_VFPU_CC = 211, - IRREG_LO = 226, // offset of lo in MIPSState / 4 - IRREG_HI = 227, - IRREG_FCR31 = 228, - IRREG_FPCOND = 229, + IRREG_LO = 242, // offset of lo in MIPSState / 4 + IRREG_HI = 243, + IRREG_FCR31 = 244, + IRREG_FPCOND = 245, }; struct IRMeta { diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index f77216ef0c37..68740ad0a672 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -144,6 +144,15 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c #endif break; + case IROp::ShuffleVec4: + { + // Can't use the SSE shuffle here because it takes an immediate. + // Backends with SSE support could use that though. + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)]; + break; + } + case IROp::FSin: mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]); break; diff --git a/Core/MIPS/MIPS.h b/Core/MIPS/MIPS.h index d3a01f1bde31..f51644fd7685 100644 --- a/Core/MIPS/MIPS.h +++ b/Core/MIPS/MIPS.h @@ -172,23 +172,24 @@ class MIPSState // However, the IR interpreter needs some temps that can stick around between ops. // Can be indexed through r[] using indices 192+. u32 t[16]; //192 - // float vt[16]; //208 TODO: VFPU temp // If vfpuCtrl (prefixes) get mysterious values, check the VFPU regcache code. u32 vfpuCtrl[16]; // 208 + float vt[16]; //224 TODO: VFPU temp + // ARM64 wants lo/hi to be aligned to 64 bits from the base of this struct. - u32 padLoHi; // 224 + u32 padLoHi; // 240 union { struct { - u32 pc; //225 + u32 pc; //241 - u32 lo; //226 - u32 hi; //227 + u32 lo; //242 + u32 hi; //243 - u32 fcr31; //fpu control register - u32 fpcond; // cache the cond flag of fcr31 (& 1 << 23) + u32 fcr31; //244 fpu control register + u32 fpcond; //245 cache the cond flag of fcr31 (& 1 << 23) }; u32 other[6]; }; From 2cbfb192c4bc52e7167e927a9d1402d722134ef7 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 12:17:25 +0200 Subject: [PATCH 49/77] IR: Lots more VFPU support, some with SIMD --- Core/MIPS/IR/IRCompVFPU.cpp | 602 ++++++++++++++++++++++++++++++-- Core/MIPS/IR/IRFrontend.h | 4 +- Core/MIPS/IR/IRInst.cpp | 16 +- Core/MIPS/IR/IRInst.h | 19 +- Core/MIPS/IR/IRInterpreter.cpp | 76 +++- Core/MIPS/IR/IRJit.h | 8 +- Core/MIPS/IR/IRPassSimplify.cpp | 8 +- Core/MIPS/IR/IRPassSimplify.h | 2 +- Core/MIPS/MIPSVFPUUtils.cpp | 4 + Core/MIPS/x86/CompVFPU.cpp | 3 +- 10 files changed, 699 insertions(+), 43 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index c9d0083d2eb3..b9620f822b9d 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -65,6 +65,26 @@ namespace MIPSComp { regs[3] == regs[2] + 1; } + // Vector regs can overlap in all sorts of swizzled ways. + // This does allow a single overlap in sregs[i]. + static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) { + for (int i = 0; i < sn; ++i) { + if (sregs[i] == dreg && i != di) + return false; + } + for (int i = 0; i < tn; ++i) { + if (tregs[i] == dreg) + return false; + } + + // Hurray, no overlap, we can write directly. 
+ return true; + } + + static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) { + return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg; + } + void IRFrontend::Comp_VPFX(MIPSOpcode op) { CONDITIONAL_DISABLE; int data = op & 0xFFFFF; @@ -146,17 +166,19 @@ namespace MIPSComp { void IRFrontend::GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) { ::GetMatrixRegs(regs, N, matrixReg); - // TODO + for (int i = 0; i < GetMatrixSide(N); i++) { + ApplyVoffset(regs + 4 * i, GetVectorSize(N)); + } } void IRFrontend::GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { _assert_(js.prefixSFlag & JitState::PREFIX_KNOWN); - ::GetVectorRegs(regs, sz, vectorReg); + GetVectorRegs(regs, sz, vectorReg); ApplyPrefixST(regs, js.prefixS, sz, IRVTEMP_PFX_S); } void IRFrontend::GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { _assert_(js.prefixTFlag & JitState::PREFIX_KNOWN); - ::GetVectorRegs(regs, sz, vectorReg); + GetVectorRegs(regs, sz, vectorReg); ApplyPrefixST(regs, js.prefixT, sz, IRVTEMP_PFX_T); } @@ -179,7 +201,8 @@ namespace MIPSComp { return (prefix >> (i * 2)) & 3; } - // "D" prefix is really a post process. No need to allocate a temporary register. + // "D" prefix is really a post process. No need to allocate a temporary register (except + // dummies to simulate writemask, which is done in GetVectorRegsPrefixD void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); if (!js.prefixD) @@ -263,17 +286,15 @@ namespace MIPSComp { VectorSize sz = GetVecSize(op); int type = (op >> 16) & 0xF; int vd = _VD; - + int n = GetNumVectorElements(sz); + u8 dregs[4]; + GetVectorRegs(dregs, sz, vd); if (sz == 4 && IsVectorColumn(vd)) { - u8 dregs[4]; - GetVectorRegs(dregs, sz, vd); - ir.Write(IROp::InitVec4, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); - } else if (sz == 1) { - u8 dreg; - GetVectorRegs(&dreg, V_Single, vd); - ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); + ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); } else { - DISABLE; + for (int i = 0; i < n; i++) { + ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); + } } } @@ -293,7 +314,7 @@ namespace MIPSComp { GetVectorRegs(dregs, sz, vd); int row = vd & 3; Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); - ir.Write(IROp::InitVec4, dregs[0], (int)init); + ir.Write(IROp::Vec4Init, dregs[0], (int)init); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { @@ -329,7 +350,7 @@ namespace MIPSComp { default: return; } - ir.Write(IROp::InitVec4, vec[0], (int)init); + ir.Write(IROp::Vec4Init, vec[0], (int)init); } return; } @@ -345,24 +366,312 @@ namespace MIPSComp { } void IRFrontend::Comp_VDot(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + int vd = _VD; + int vs = _VS; + int vt = _VT; + VectorSize sz = GetVecSize(op); + + // TODO: Force read one of them into regs? probably not. + u8 sregs[4], tregs[4], dregs[1]; + GetVectorRegsPrefixS(sregs, sz, vs); + GetVectorRegsPrefixT(tregs, sz, vt); + GetVectorRegsPrefixD(dregs, V_Single, vd); + + int temp0 = IRVTEMP_0; + int temp1 = IRVTEMP_0 + 1; + ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]); + int n = GetNumVectorElements(sz); + for (int i = 1; i < n; i++) { + ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]); + ir.Write(IROp::FAdd, i == (n - 1) ? 
dregs[0] : temp0, temp0, temp1); + } + ApplyPrefixD(dregs, V_Single); } void IRFrontend::Comp_VecDo3(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix()) + DISABLE; + + // Check that we can support the ops, and prepare temporary values for ops that need it. + bool allowSIMD = true; + switch (op >> 26) { + case 24: //VFPU0 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] + t[i]; break; //vadd + case 1: // d[i] = s[i] - t[i]; break; //vsub + case 7: // d[i] = s[i] / t[i]; break; //vdiv + break; + default: + DISABLE; + } + break; + case 25: //VFPU1 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] * t[i]; break; //vmul + break; + default: + DISABLE; + } + break; + case 27: //VFPU3 + switch ((op >> 23) & 7) { + case 2: // vmin + case 3: // vmax + allowSIMD = false; + break; + case 6: // vsge + case 7: // vslt + allowSIMD = false; + break; + default: + DISABLE; + } + break; + default: + DISABLE; + break; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], tregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixT(tregs, sz, _VT); + GetVectorRegsPrefixD(dregs, sz, _VD); + + if (allowSIMD && sz == V_Quad && IsConsecutive4(sregs) && IsConsecutive4(dregs) && IsConsecutive4(sregs)) { + IROp opFunc = IROp::Nop; + bool symmetric = false; + switch (op >> 26) { + case 24: //VFPU0 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] + t[i]; break; //vadd + opFunc = IROp::Vec4Add; + symmetric = true; + break; + case 1: // d[i] = s[i] - t[i]; break; //vsub + opFunc = IROp::Vec4Sub; + break; + case 7: // d[i] = s[i] / t[i]; break; //vdiv + opFunc = IROp::Vec4Div; + break; + } + break; + case 25: //VFPU1 + switch ((op >> 23) & 7) + { + case 0: // d[i] = s[i] * t[i]; break; //vmul + opFunc = IROp::Vec4Mul; + symmetric = true; + break; + } + break; + case 27: //VFPU3 + switch ((op >> 23) & 7) + { + case 2: // vmin + case 3: // vmax + case 6: // vsge + case 7: // vslt + DISABLE; + break; + } + break; + } + + if (opFunc != IROp::Nop) { + ir.Write(opFunc, dregs[0], sregs[0], tregs[0]); + } + + ApplyPrefixD(dregs, sz); + return; + } + + for (int i = 0; i < n; ++i) { + switch (op >> 26) { + case 24: //VFPU0 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] + t[i]; break; //vadd + ir.Write(IROp::FAdd, dregs[i], sregs[i], tregs[i]); + break; + case 1: // d[i] = s[i] - t[i]; break; //vsub + ir.Write(IROp::FSub, dregs[i], sregs[i], tregs[i]); + break; + case 7: // d[i] = s[i] / t[i]; break; //vdiv + ir.Write(IROp::FDiv, dregs[i], sregs[i], tregs[i]); + break; + } + break; + case 25: //VFPU1 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] * t[i]; break; //vmul + ir.Write(IROp::FMul, dregs[i], sregs[i], tregs[i]); + break; + } + break; + case 27: //VFPU3 + switch ((op >> 23) & 7) { + case 2: // vmin + ir.Write(IROp::FMin, dregs[i], sregs[i], tregs[i]); + break; + case 3: // vmax + ir.Write(IROp::FMax, dregs[i], sregs[i], tregs[i]); + break; + case 6: // vsge + case 7: // vslt + DISABLE; + break; + } + break; + } + } + + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_VV2Op(MIPSOpcode op) { - CONDITIONAL_DISABLE; - // Eliminate silly no-op VMOVs, common in Wipeout Pure - if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) { + if (js.HasUnknownPrefix()) + DISABLE; + + int vs = _VS; + int vd = _VD; + + // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure + if (((op >> 16) & 0x1f) == 0 && vs == vd && js.HasNoPrefix()) { return; } - DISABLE; + + VectorSize sz = GetVecSize(op); + int n = 
GetNumVectorElements(sz); + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, vs); + GetVectorRegsPrefixD(dregs, sz, vd); + + bool canSIMD = false; + // Some can be SIMD'd. + switch ((op >> 16) & 0x1f) { + case 0: // vmov + canSIMD = true; + break; + } + + if (canSIMD && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { + switch ((op >> 16) & 0x1f) { + case 0: // vmov + ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]); + break; + } + ApplyPrefixD(dregs, sz); + return; + } + + for (int i = 0; i < n; ++i) { + switch ((op >> 16) & 0x1f) { + case 0: // d[i] = s[i]; break; //vmov + // Probably for swizzle. + ir.Write(IROp::FMov, dregs[i], sregs[i]); + break; + case 1: // d[i] = fabsf(s[i]); break; //vabs + ir.Write(IROp::FAbs, dregs[i], sregs[i]); + break; + case 2: // d[i] = -s[i]; break; //vneg + ir.Write(IROp::FNeg, dregs[i], sregs[i]); + break; + case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0 + ir.Write(IROp::FSat0_1, dregs[i], sregs[i]); + break; + case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1 + ir.Write(IROp::FSatMinus1_1, dregs[i], sregs[i]); + break; + case 16: // d[i] = 1.0f / s[i]; break; //vrcp + ir.Write(IROp::FRecip, dregs[i], sregs[i]); + break; + case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq + ir.Write(IROp::FRSqrt, dregs[i], sregs[i]); + break; + case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin + ir.Write(IROp::FSin, dregs[i], sregs[i]); + break; + case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos + ir.Write(IROp::FCos, dregs[i], sregs[i]); + break; + case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2 + DISABLE; + break; + case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2 + DISABLE; + break; + case 22: // d[i] = sqrtf(s[i]); break; //vsqrt + ir.Write(IROp::FSqrt, dregs[i], sregs[i]); + break; + case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin + ir.Write(IROp::FAsin, dregs[i], sregs[i]); + break; + case 24: // d[i] = -1.0f / s[i]; break; // vnrcp + ir.Write(IROp::FRecip, dregs[i], sregs[i]); + ir.Write(IROp::FNeg, dregs[i], dregs[i]); + break; + case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin + ir.Write(IROp::FSin, dregs[i], sregs[i]); + ir.Write(IROp::FNeg, dregs[i], dregs[i]); + break; + case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2 + default: + DISABLE; + break; + } + } + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_Vi2f(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + int imm = (op >> 16) & 0x1f; + const float mult = 1.0f / (float)(1UL << imm); + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, sz, _VD); + + int tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + tempregs[i] = IRVTEMP_PFX_T + i; // Need IRVTEMP_0 for the scaling factor + } else { + tempregs[i] = dregs[i]; + } + } + if (mult != 1.0f) + ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(mult)); + // TODO: Use the SCVTF with builtin scaling where possible. 
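The loop that follows converts each lane with FCvtSW and then multiplies by the 1/2^imm constant placed in IRVTEMP_0 just above. A scalar reference for one lane is sketched below; rounding-mode details of the real instruction are ignored.

#include <cstdint>
#include <cstdio>

// vi2f reference: treat the lane as a signed 32-bit integer, convert to float, and scale
// down by 2^imm (imm in 0..31), matching the FCvtSW + FMul sequence emitted above.
static float Vi2fLaneModel(int32_t s, int imm) {
    const float mult = 1.0f / (float)(1UL << imm);  // same constant as AddConstantFloat(mult)
    return (float)s * mult;
}

int main() {
    printf("%f\n", Vi2fLaneModel(32768, 15));  // 1.0
    return 0;
}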
+ for (int i = 0; i < n; i++) { + ir.Write(IROp::FCvtSW, tempregs[i], sregs[i]); + if (mult != 1.0f) + ir.Write(IROp::FMul, tempregs[i], tempregs[i], IRVTEMP_0); + } + + for (int i = 0; i < n; ++i) { + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_Vh2f(MIPSOpcode op) { @@ -414,11 +723,115 @@ namespace MIPSComp { } void IRFrontend::Comp_VScl(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[4], treg; + GetVectorRegsPrefixS(sregs, sz, _VS); + // TODO: Prefixes seem strange... + GetVectorRegsPrefixT(&treg, V_Single, _VT); + GetVectorRegsPrefixD(dregs, sz, _VD); + + if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { + // In this case, there's zero danger of overlap. + ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); + ApplyPrefixD(dregs, sz); + return; + } + + // For prefixes to work, we just have to ensure that none of the output registers spill + // and that there's no overlap. + int tempregs[4]; + for (int i = 0; i < n; ++i) { + // for vscl, it's fine if dregs[i] = sregs[i] + if (dregs[i] != sregs[i] && !IsOverlapSafe(dregs[i], i, n, sregs)) { + // Need to use temp regs + tempregs[i] = IRVTEMP_0 + i; + } else { + tempregs[i] = dregs[i]; + } + } + + for (int i = 0; i < n; i++) { + ir.Write(IROp::FMul, tempregs[i], sregs[i], treg); + } + + for (int i = 0; i < n; i++) { + // All must be mapped for prefixes to work. + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + + ApplyPrefixD(dregs, sz); } + // This may or may not be a win when using the IR interpreter... + // Many more instructions to interpret. void IRFrontend::Comp_Vmmul(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + MatrixSize sz = GetMtxSize(op); + int n = GetMatrixSide(sz); + + MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz); + MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz); + + u8 sregs[16], tregs[16], dregs[16]; + GetMatrixRegs(sregs, sz, _VS); + GetMatrixRegs(tregs, sz, _VT); + GetMatrixRegs(dregs, sz, _VD); + + if (soverlap || toverlap) { + DISABLE; + } + + if (sz == M_4x4 && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { + logBlocks = 1; + int s0 = IRVTEMP_0; + int s1 = IRVTEMP_PFX_T; + if (!IsConsecutive4(sregs)) { + for (int j = 0; j < 4; j++) { + ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]); + for (int i = 1; i < 4; i++) { + ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[j * 4 + i]); + ir.Write(IROp::Vec4Add, s0, s0, s1); + } + ir.Write(IROp::Vec4Mov, dregs[j * 4], s0); + } + return; + } else { + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + ir.Write(IROp::Vec4Dot, s0 + i, sregs[i], tregs[j * 4]); + } + ir.Write(IROp::Vec4Mov, dregs[j * 4], s0); + } + return; + } + } else { + // logBlocks = 1; + } + + int temp0 = IRVTEMP_0; + int temp1 = IRVTEMP_0 + 1; + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + ir.Write(IROp::FMul, temp0, sregs[b * 4], tregs[a * 4]); + for (int c = 1; c < n; c++) { + ir.Write(IROp::FMul, temp1, sregs[b * 4 + c], tregs[a * 4 + c]); + ir.Write(IROp::FAdd, (c == n - 1) ? 
dregs[a * 4 + b] : temp0, temp0, temp1); + } + } + } } void IRFrontend::Comp_Vmscl(MIPSOpcode op) { @@ -426,7 +839,78 @@ namespace MIPSComp { } void IRFrontend::Comp_Vtfm(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + MatrixSize msz = GetMtxSize(op); + int n = GetNumVectorElements(sz); + int ins = (op >> 23) & 7; + + bool homogenous = false; + if (n == ins) { + n++; + sz = (VectorSize)((int)(sz)+1); + msz = (MatrixSize)((int)(msz)+1); + homogenous = true; + } + // Otherwise, n should already be ins + 1. + else if (n != ins + 1) { + DISABLE; + } + + u8 sregs[16], dregs[4], tregs[4]; + GetMatrixRegs(sregs, msz, _VS); + GetVectorRegs(tregs, sz, _VT); + GetVectorRegs(dregs, sz, _VD); + + // SIMD-optimized implementations + if (msz == M_4x4 && !homogenous && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { + int s0 = IRVTEMP_0; + int s1 = IRVTEMP_PFX_T; + if (!IsConsecutive4(sregs)) { + ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); + for (int i = 1; i < 4; i++) { + ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); + ir.Write(IROp::Vec4Add, s0, s0, s1); + } + ir.Write(IROp::Vec4Mov, dregs[0], s0); + return; + } else { + for (int i = 0; i < 4; i++) { + ir.Write(IROp::Vec4Dot, s0 + i, sregs[i], tregs[0]); + } + ir.Write(IROp::Vec4Mov, dregs[0], s0); + return; + } + } else if (msz == M_4x4) { + logBlocks = 1; + } + + // TODO: test overlap, optimize. + int tempregs[4]; + int s0 = IRVTEMP_0; + int temp1 = IRVTEMP_0 + 1; + for (int i = 0; i < n; i++) { + ir.Write(IROp::FMul, s0, sregs[i * 4], tregs[0]); + for (int k = 1; k < n; k++) { + if (!homogenous || k != n - 1) { + ir.Write(IROp::FMul, temp1, sregs[i * 4 + k], tregs[k]); + ir.Write(IROp::FAdd, s0, s0, temp1); + } else { + ir.Write(IROp::FAdd, s0, s0, sregs[i * 4 + k]); + } + } + int temp = IRVTEMP_PFX_T + i; + ir.Write(IROp::FMov, temp, s0); + tempregs[i] = temp; + } + for (int i = 0; i < n; i++) { + u8 temp = tempregs[i]; + ir.Write(IROp::FMov, dregs[i], temp); + } } void IRFrontend::Comp_VCrs(MIPSOpcode op) { @@ -446,19 +930,53 @@ namespace MIPSComp { } void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], tregs[4], dregs[4]; + GetVectorRegs(sregs, sz, _VS); + GetVectorRegs(tregs, sz, _VT); + GetVectorRegs(dregs, sz, _VD); + + if (sz == V_Triple) { + int temp0 = IRVTEMP_0; + int temp1 = IRVTEMP_0 + 1; + // Compute X + ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]); + ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]); + ir.Write(IROp::FSub, dregs[0], temp0, temp1); + + // Compute Y + ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]); + ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]); + ir.Write(IROp::FSub, dregs[1], temp0, temp1); + + // Compute Z + ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]); + ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]); + ir.Write(IROp::FSub, dregs[2], temp0, temp1); + } else if (sz == V_Quad) { + DISABLE; + } } void IRFrontend::Comp_Vcmp(MIPSOpcode op) { + // Fiendishly hard... DISABLE; } void IRFrontend::Comp_Vcmov(MIPSOpcode op) { + // Fiendishly hard... 
DISABLE; } void IRFrontend::Comp_Viim(MIPSOpcode op) { - if (!js.HasUnknownPrefix()) + if (js.HasUnknownPrefix()) DISABLE; s32 imm = (s32)(s16)(u16)(op & 0xFFFF); @@ -469,11 +987,37 @@ namespace MIPSComp { } void IRFrontend::Comp_Vfim(MIPSOpcode op) { - DISABLE; + if (js.HasUnknownPrefix()) + DISABLE; + + FP16 half; + half.u = op & 0xFFFF; + FP32 fval = half_to_float_fast5(half); + + u8 dreg; + GetVectorRegsPrefixD(&dreg, V_Single, _VT); + ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f)); + ApplyPrefixD(&dreg, V_Single); } void IRFrontend::Comp_Vcst(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix()) + DISABLE; + + int conNum = (op >> 16) & 0x1f; + int vd = _VD; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 dregs[4]; + GetVectorRegsPrefixD(dregs, sz, _VD); + for (int i = 0; i < n; i++) { + ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum])); + } + ApplyPrefixD(dregs, sz); } // Very heavily used by FF:CC. Should be replaced by a fast approximation instead of diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h index 7e813a307d81..7a9a9196120d 100644 --- a/Core/MIPS/IR/IRFrontend.h +++ b/Core/MIPS/IR/IRFrontend.h @@ -95,7 +95,9 @@ class IRFrontend : public MIPSFrontendInterface { void ApplyRoundingMode(bool force = false); void UpdateRoundingMode(); - void EatPrefix() { js.EatPrefix(); } + void EatPrefix() override { + js.EatPrefix(); + } void FlushAll(); void FlushPrefixV(); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 9d41d74fbfdd..92494612201d 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -4,6 +4,7 @@ #include "Core/MIPS/MIPSDebugInterface.h" static const IRMeta irMeta[] = { + { IROp::Nop, "Nop", "" }, { IROp::SetConst, "SetConst", "GC" }, { IROp::SetConstF, "SetConstF", "FC" }, { IROp::Mov, "Mov", "GG" }, @@ -68,6 +69,8 @@ static const IRMeta irMeta[] = { { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, { IROp::FDiv, "FDiv", "FFF" }, + { IROp::FMin, "FMin", "FFF" }, + { IROp::FMax, "FMax", "FFF" }, { IROp::FMov, "FMov", "FF" }, { IROp::FSqrt, "FSqrt", "FF" }, { IROp::FSin, "FSin", "FF" }, @@ -92,8 +95,15 @@ static const IRMeta irMeta[] = { { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, - { IROp::InitVec4, "InitVec4", "Fv" }, - { IROp::ShuffleVec4, "ShuffleVec4", "FFs" }, + { IROp::Vec4Init, "Vec4Init", "Fv" }, + { IROp::Vec4Shuffle, "Vec4Shuffle", "FFs" }, + { IROp::Vec4Mov, "Vec4Mov", "FF" }, + { IROp::Vec4Add, "Vec4Add", "FFF" }, + { IROp::Vec4Sub, "Vec4Sub", "FFF" }, + { IROp::Vec4Div, "Vec4Div", "FFF" }, + { IROp::Vec4Mul, "Vec4Mul", "FFF" }, + { IROp::Vec4Scale, "Vec4Scale", "FFF" }, + { IROp::Vec4Dot, "Vec4Dot", "FFF" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, @@ -219,7 +229,7 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co snprintf(buf, bufSize, "%s", initVec4Names[param]); break; case 's': - snprintf(buf, bufSize, "%s%s%s%s", xyzw[param & 3], xyzw[(param >> 2) & 3], xyzw[(param >> 4) & 3], xyzw[(param >> 6) & 3]); + snprintf(buf, bufSize, "%c%c%c%c", xyzw[param & 3], xyzw[(param >> 2) & 3], xyzw[(param >> 4) & 3], xyzw[(param >> 6) & 3]); break; case '_': case '\0': diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index e9bbc4acbf3b..c296afb96755 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -16,6 +16,8 @@ // MIPS->target JITs. 
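The Vec4Shuffle op registered above packs one source-lane selector per two bits, low bits first, which is what the new "s" disassembly prints as xyzw and what the interpreter case added in the previous patch computes. A scalar model follows, including why 0xE4 (binary 11 10 01 00) is the identity selector that ApplyPrefixST early-outs on.

#include <cstdio>

// Scalar model of Vec4Shuffle: dest[i] = src[(sel >> (2 * i)) & 3], matching the
// interpreter case. src and dest are kept distinct here, as in the IR usage.
static void Vec4ShuffleModel(float dest[4], const float src[4], unsigned sel) {
    for (int i = 0; i < 4; i++)
        dest[i] = src[(sel >> (2 * i)) & 3];
}

int main() {
    const float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float out[4];
    Vec4ShuffleModel(out, v, 0xE4);                           // identity: x, y, z, w
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
    Vec4ShuffleModel(out, v, 0x00);                           // splat x
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 1 1 1
    return 0;
}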
enum class IROp : u8 { + Nop, + SetConst, SetConstF, @@ -103,6 +105,8 @@ enum class IROp : u8 { FSub, FMul, FDiv, + FMin, + FMax, FMov, FSqrt, @@ -134,6 +138,10 @@ enum class IROp : u8 { FCmpLessUnordered, FCmpLessEqualOrdered, FCmpLessEqualUnordered, + FCmpEqualZero, + FCmpNotEqualZero, + + FCmovVfpuCC, // Rounding Mode RestoreRoundingMode, @@ -145,8 +153,15 @@ enum class IROp : u8 { // 4-wide instructions to assist SIMD. // Can of course add a pass to break them up if a target does not // support SIMD. - InitVec4, - ShuffleVec4, + Vec4Init, + Vec4Shuffle, + Vec4Mov, + Vec4Add, + Vec4Sub, + Vec4Mul, + Vec4Div, + Vec4Scale, + Vec4Dot, // Slow special functions. Used on singles. FSin, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 68740ad0a672..4e572745f0a5 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -2,6 +2,9 @@ #include #endif +#include +#include + #include "Core/MemMap.h" #include "Core/HLE/HLE.h" #include "Core/HLE/ReplaceTables.h" @@ -136,7 +139,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } - case IROp::InitVec4: + case IROp::Vec4Init: #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); #else @@ -144,7 +147,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c #endif break; - case IROp::ShuffleVec4: + case IROp::Vec4Shuffle: { // Can't use the SSE shuffle here because it takes an immediate. // Backends with SSE support could use that though. @@ -153,6 +156,69 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c break; } + case IROp::Vec4Mov: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1])); +#else + memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float)); +#endif + break; + + case IROp::Vec4Add: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Sub: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Mul: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Div: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Scale: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; +#endif + break; + + // Not quickly implementable on all platforms, unfortunately. 
+ case IROp::Vec4Dot: + { + float dot = mips->f[inst->src1] * mips->f[inst->src2]; + for (int i = 1; i < 4; i++) + dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; + mips->f[inst->dest] = dot; + break; + } + case IROp::FSin: mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]); break; @@ -299,6 +365,12 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FDiv: mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2]; break; + case IROp::FMin: + mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]); + break; + case IROp::FMax: + mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]); + break; case IROp::FMov: mips->f[inst->dest] = mips->f[inst->src1]; diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index aa026b0bd8d5..87a8231bff18 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -57,10 +57,14 @@ class IRBlock { void SetInstructions(const std::vector &inst, const std::vector &constants) { instr_ = new IRInst[inst.size()]; numInstructions_ = (u16)inst.size(); - memcpy(instr_, &inst[0], sizeof(IRInst) * inst.size()); + if (!inst.empty()) { + memcpy(instr_, &inst[0], sizeof(IRInst) * inst.size()); + } const_ = new u32[constants.size()]; numConstants_ = (u16)constants.size(); - memcpy(const_, &constants[0], sizeof(u32) * constants.size()); + if (!constants.empty()) { + memcpy(const_, &constants[0], sizeof(u32) * constants.size()); + } } const IRInst *GetInstructions() const { return instr_; } diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index e846c8420a1c..477774f4355a 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -368,7 +368,13 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } break; - case IROp::InitVec4: + case IROp::Vec4Init: + case IROp::Vec4Add: + case IROp::Vec4Sub: + case IROp::Vec4Mul: + case IROp::Vec4Div: + case IROp::Vec4Scale: + case IROp::Vec4Shuffle: out.Write(inst); break; diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index 5bf3f53fb9eb..72e87ace2150 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -5,4 +5,4 @@ typedef bool (*IRPassFunc)(const IRWriter &in, IRWriter &out); bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out); -bool PropagateConstants(const IRWriter &in, IRWriter &out); \ No newline at end of file +bool PropagateConstants(const IRWriter &in, IRWriter &out); diff --git a/Core/MIPS/MIPSVFPUUtils.cpp b/Core/MIPS/MIPSVFPUUtils.cpp index 385926d5b421..cd9739be0278 100644 --- a/Core/MIPS/MIPSVFPUUtils.cpp +++ b/Core/MIPS/MIPSVFPUUtils.cpp @@ -395,6 +395,10 @@ MatrixOverlapType GetMatrixOverlap(int mtx1, int mtx2, MatrixSize msize) { if (mtx1 == mtx2) return OVERLAP_EQUAL; + if (msize == M_4x4) { + return (mtx1 == mtx2) ? OVERLAP_EQUAL : OVERLAP_NONE; + } + u8 m1[16]; u8 m2[16]; GetMatrixRegs(m1, msize, mtx1); diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 749967f53a61..ec95760d57e4 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -1801,8 +1801,7 @@ void Jit::Comp_Vf2i(MIPSOpcode op) { const double *mult = &mulTableVf2i[imm]; int setMXCSR = -1; - switch ((op >> 21) & 0x1f) - { + switch ((op >> 21) & 0x1f) { case 17: break; //z - truncate. Easy to support. 
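Looking back at the Vec4 interpreter cases a few hunks above, here is a scalar reference for the two whose operand meaning is least obvious: Vec4Scale multiplies every lane of the source vector by one scalar register (the SSE path broadcasts it with _mm_set1_ps), and Vec4Dot reduces a four-wide multiply into a single destination lane. This is a sketch of the intended semantics, not the interpreter's code.

#include <cstdio>

// Scalar reference for Vec4Scale and Vec4Dot as used by the VFPU lowering.
static void Vec4ScaleModel(float out[4], const float a[4], float s) {
    for (int i = 0; i < 4; i++)
        out[i] = a[i] * s;          // single scalar applied to all four lanes
}

static float Vec4DotModel(const float a[4], const float b[4]) {
    float dot = 0.0f;
    for (int i = 0; i < 4; i++)
        dot += a[i] * b[i];         // reduction into one float
    return dot;
}

int main() {
    const float a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
    float scaled[4];
    Vec4ScaleModel(scaled, a, 2.0f);
    printf("scale: %g %g %g %g, dot: %g\n", scaled[0], scaled[1], scaled[2], scaled[3],
           Vec4DotModel(a, b));     // scale: 2 4 6 8, dot: 20
    return 0;
}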
case 16: From cb251ea93fa4296833e4ea7b4eca442bd88bb003 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 12:18:12 +0200 Subject: [PATCH 50/77] Crashfix in savestate (hmmmm...) --- Core/SaveState.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Core/SaveState.cpp b/Core/SaveState.cpp index bf6f30eeeda6..0b4465144c45 100644 --- a/Core/SaveState.cpp +++ b/Core/SaveState.cpp @@ -234,9 +234,14 @@ namespace SaveState if (MIPSComp::jit && p.mode == p.MODE_WRITE) { auto blockCache = MIPSComp::jit->GetBlockCache(); - auto savedBlocks = blockCache->SaveAndClearEmuHackOps(); + std::vector savedBlocks; + if (blockCache) { + savedBlocks = blockCache->SaveAndClearEmuHackOps(); + } Memory::DoState(p); - blockCache->RestoreSavedEmuHackOps(savedBlocks); + if (blockCache) { + blockCache->RestoreSavedEmuHackOps(savedBlocks); + } } else Memory::DoState(p); From 182674cddf7f5f1b03703a1bf14d5b8eee03d558 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 13:10:26 +0200 Subject: [PATCH 51/77] IR: SIMD another matrix orientation. Fix various issues. --- Core/MIPS/IR/IRCompVFPU.cpp | 136 +++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 41 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index b9620f822b9d..1670deb967b2 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -34,7 +34,7 @@ // All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. // Currently known non working ones should have DISABLE. -// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; } +// #define CONDITIONAL_DISABLE { Comp_Generic(op); return; } #define CONDITIONAL_DISABLE ; #define DISABLE { Comp_Generic(op); return; } @@ -280,8 +280,9 @@ namespace MIPSComp { } void IRFrontend::Comp_VVectorInit(MIPSOpcode op) { - if (!js.HasNoPrefix()) + if (js.HasUnknownPrefix()) { DISABLE; + } VectorSize sz = GetVecSize(op); int type = (op >> 16) & 0xF; @@ -299,8 +300,9 @@ namespace MIPSComp { } void IRFrontend::Comp_VIdt(MIPSOpcode op) { - if (!js.HasNoPrefix()) + if (js.HasUnknownPrefix()) { DISABLE; + } int vd = _VD; VectorSize sz = GetVecSize(op); @@ -447,7 +449,18 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, _VT); GetVectorRegsPrefixD(dregs, sz, _VD); - if (allowSIMD && sz == V_Quad && IsConsecutive4(sregs) && IsConsecutive4(dregs) && IsConsecutive4(sregs)) { + int tempregs[4]; + bool usingTemps = false; + for (int i = 0; i < n; i++) { + if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) { + tempregs[i] = IRVTEMP_0 + i; + usingTemps = true; + } else { + tempregs[i] = dregs[i]; + } + } + + if (allowSIMD && sz == V_Quad && !usingTemps && IsConsecutive4(sregs) && IsConsecutive4(dregs) && IsConsecutive4(sregs)) { IROp opFunc = IROp::Nop; bool symmetric = false; switch (op >> 26) { @@ -490,7 +503,6 @@ namespace MIPSComp { if (opFunc != IROp::Nop) { ir.Write(opFunc, dregs[0], sregs[0], tregs[0]); } - ApplyPrefixD(dregs, sz); return; } @@ -500,30 +512,30 @@ namespace MIPSComp { case 24: //VFPU0 switch ((op >> 23) & 7) { case 0: // d[i] = s[i] + t[i]; break; //vadd - ir.Write(IROp::FAdd, dregs[i], sregs[i], tregs[i]); + ir.Write(IROp::FAdd, tempregs[i], sregs[i], tregs[i]); break; case 1: // d[i] = s[i] - t[i]; break; //vsub - ir.Write(IROp::FSub, dregs[i], sregs[i], tregs[i]); + ir.Write(IROp::FSub, tempregs[i], sregs[i], tregs[i]); break; case 7: // d[i] = s[i] / t[i]; break; //vdiv - ir.Write(IROp::FDiv, dregs[i], 
sregs[i], tregs[i]); + ir.Write(IROp::FDiv, tempregs[i], sregs[i], tregs[i]); break; } break; case 25: //VFPU1 switch ((op >> 23) & 7) { case 0: // d[i] = s[i] * t[i]; break; //vmul - ir.Write(IROp::FMul, dregs[i], sregs[i], tregs[i]); + ir.Write(IROp::FMul, tempregs[i], sregs[i], tregs[i]); break; } break; case 27: //VFPU3 switch ((op >> 23) & 7) { case 2: // vmin - ir.Write(IROp::FMin, dregs[i], sregs[i], tregs[i]); + ir.Write(IROp::FMin, tempregs[i], sregs[i], tregs[i]); break; case 3: // vmax - ir.Write(IROp::FMax, dregs[i], sregs[i], tregs[i]); + ir.Write(IROp::FMax, tempregs[i], sregs[i], tregs[i]); break; case 6: // vsge case 7: // vslt @@ -534,6 +546,12 @@ namespace MIPSComp { } } + for (int i = 0; i < n; i++) { + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + ApplyPrefixD(dregs, sz); } @@ -556,6 +574,17 @@ namespace MIPSComp { GetVectorRegsPrefixS(sregs, sz, vs); GetVectorRegsPrefixD(dregs, sz, vd); + bool usingTemps = false; + int tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + usingTemps = true; + tempregs[i] = IRVTEMP_0 + i; + } else { + tempregs[i] = dregs[i]; + } + } + bool canSIMD = false; // Some can be SIMD'd. switch ((op >> 16) & 0x1f) { @@ -564,7 +593,7 @@ namespace MIPSComp { break; } - if (canSIMD && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { + if (canSIMD && !usingTemps && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { switch ((op >> 16) & 0x1f) { case 0: // vmov ir.Write(IROp::Vec4Mov, dregs[0], sregs[0]); @@ -578,31 +607,31 @@ namespace MIPSComp { switch ((op >> 16) & 0x1f) { case 0: // d[i] = s[i]; break; //vmov // Probably for swizzle. - ir.Write(IROp::FMov, dregs[i], sregs[i]); + ir.Write(IROp::FMov, tempregs[i], sregs[i]); break; case 1: // d[i] = fabsf(s[i]); break; //vabs - ir.Write(IROp::FAbs, dregs[i], sregs[i]); + ir.Write(IROp::FAbs, tempregs[i], sregs[i]); break; case 2: // d[i] = -s[i]; break; //vneg - ir.Write(IROp::FNeg, dregs[i], sregs[i]); + ir.Write(IROp::FNeg, tempregs[i], sregs[i]); break; case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0 - ir.Write(IROp::FSat0_1, dregs[i], sregs[i]); + ir.Write(IROp::FSat0_1, tempregs[i], sregs[i]); break; case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1 - ir.Write(IROp::FSatMinus1_1, dregs[i], sregs[i]); + ir.Write(IROp::FSatMinus1_1, tempregs[i], sregs[i]); break; case 16: // d[i] = 1.0f / s[i]; break; //vrcp - ir.Write(IROp::FRecip, dregs[i], sregs[i]); + ir.Write(IROp::FRecip, tempregs[i], sregs[i]); break; case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq - ir.Write(IROp::FRSqrt, dregs[i], sregs[i]); + ir.Write(IROp::FRSqrt, tempregs[i], sregs[i]); break; case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin - ir.Write(IROp::FSin, dregs[i], sregs[i]); + ir.Write(IROp::FSin, tempregs[i], sregs[i]); break; case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos - ir.Write(IROp::FCos, dregs[i], sregs[i]); + ir.Write(IROp::FCos, tempregs[i], sregs[i]); break; case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2 DISABLE; @@ -611,18 +640,18 @@ namespace MIPSComp { DISABLE; break; case 22: // d[i] = sqrtf(s[i]); break; //vsqrt - ir.Write(IROp::FSqrt, dregs[i], sregs[i]); + ir.Write(IROp::FSqrt, tempregs[i], sregs[i]); break; case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin - ir.Write(IROp::FAsin, dregs[i], sregs[i]); + ir.Write(IROp::FAsin, tempregs[i], sregs[i]); break; case 24: // 
d[i] = -1.0f / s[i]; break; // vnrcp - ir.Write(IROp::FRecip, dregs[i], sregs[i]); - ir.Write(IROp::FNeg, dregs[i], dregs[i]); + ir.Write(IROp::FRecip, tempregs[i], sregs[i]); + ir.Write(IROp::FNeg, tempregs[i], tempregs[i]); break; case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin - ir.Write(IROp::FSin, dregs[i], sregs[i]); - ir.Write(IROp::FNeg, dregs[i], dregs[i]); + ir.Write(IROp::FSin, tempregs[i], sregs[i]); + ir.Write(IROp::FNeg, tempregs[i], tempregs[i]); break; case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2 default: @@ -630,6 +659,12 @@ namespace MIPSComp { break; } } + for (int i = 0; i < n; i++) { + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + ApplyPrefixD(dregs, sz); } @@ -782,20 +817,32 @@ namespace MIPSComp { MatrixSize sz = GetMtxSize(op); int n = GetMatrixSide(sz); - MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz); - MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz); + int vs = _VS; + int vd = _VD; + int vt = _VT; + MatrixOverlapType soverlap = GetMatrixOverlap(vs, vd, sz); + MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz); + + // A very common arrangment. Rearrange to something we can handle. + if (IsMatrixTransposed(vd) && !IsMatrixTransposed(vs) && IsMatrixTransposed(vt)) { + // Matrix identity says (At * Bt) = (B * A)t + // D = S * T + // Dt = (S * T)t = (Tt * St) + vd = TransposeMatrixReg(vd); + std::swap(vs, vt); + } u8 sregs[16], tregs[16], dregs[16]; - GetMatrixRegs(sregs, sz, _VS); - GetMatrixRegs(tregs, sz, _VT); - GetMatrixRegs(dregs, sz, _VD); + GetMatrixRegs(sregs, sz, vs); + GetMatrixRegs(tregs, sz, vt); + GetMatrixRegs(dregs, sz, vd); if (soverlap || toverlap) { DISABLE; } - if (sz == M_4x4 && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { - logBlocks = 1; + // TODO: The interpreter would like proper matrix ops better. Can generate those, and + // expand them like this as needed on "real" architectures. int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; if (!IsConsecutive4(sregs)) { @@ -817,10 +864,12 @@ namespace MIPSComp { } return; } - } else { - // logBlocks = 1; + } else if (sz == M_4x4) { + // Tekken 6 has a case here: MEE + logBlocks = 1; } + // Fallback. Expands a LOT int temp0 = IRVTEMP_0; int temp1 = IRVTEMP_0 + 1; for (int a = 0; a < n; a++) { @@ -867,18 +916,23 @@ namespace MIPSComp { GetVectorRegs(dregs, sz, _VD); // SIMD-optimized implementations - if (msz == M_4x4 && !homogenous && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { + if (msz == M_4x4 && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; if (!IsConsecutive4(sregs)) { ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); for (int i = 1; i < 4; i++) { - ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); - ir.Write(IROp::Vec4Add, s0, s0, s1); + if (!homogenous || (i != n - 1)) { + ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); + ir.Write(IROp::Vec4Add, s0, s0, s1); + } else { + logBlocks = 1; + ir.Write(IROp::Vec4Add, s0, s0, sregs[i]); + } } ir.Write(IROp::Vec4Mov, dregs[0], s0); return; - } else { + } else if (!homogenous) { for (int i = 0; i < 4; i++) { ir.Write(IROp::Vec4Dot, s0 + i, sregs[i], tregs[0]); } @@ -886,7 +940,7 @@ namespace MIPSComp { return; } } else if (msz == M_4x4) { - logBlocks = 1; + // logBlocks = 1; } // TODO: test overlap, optimize. From 850d0abc91fb29aece41586a3c9e35434c7ac494 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 20:16:15 +0200 Subject: [PATCH 52/77] IR: More VFPU. 
Support normal fp compares. --- Core/MIPS/IR/IRCompFPU.cpp | 23 ++-- Core/MIPS/IR/IRCompVFPU.cpp | 225 ++++++++++++++++++++++++++++---- Core/MIPS/IR/IRFrontend.cpp | 3 +- Core/MIPS/IR/IRFrontend.h | 8 +- Core/MIPS/IR/IRInst.cpp | 6 + Core/MIPS/IR/IRInst.h | 23 ++-- Core/MIPS/IR/IRInterpreter.cpp | 27 ++++ Core/MIPS/IR/IRPassSimplify.cpp | 8 +- 8 files changed, 261 insertions(+), 62 deletions(-) diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index 068a58013a87..f046704ac7c4 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -94,7 +94,7 @@ void IRFrontend::Comp_FPULS(MIPSOpcode op) { } void IRFrontend::Comp_FPUComp(MIPSOpcode op) { - DISABLE; // IROps not yet implemented + CONDITIONAL_DISABLE; int opc = op & 0xF; if (opc >= 8) opc -= 8; // alias @@ -105,35 +105,34 @@ void IRFrontend::Comp_FPUComp(MIPSOpcode op) { int fs = _FS; int ft = _FT; - - IROp irOp; + IRFpCompareMode mode; switch (opc) { case 1: // un, ngle (unordered) - irOp = IROp::FCmpUnordered; + mode = IRFpCompareMode::NotEqualUnordered; break; case 2: // eq, seq (equal, ordered) - irOp = IROp::FCmpEqual; + mode = IRFpCompareMode::EqualOrdered; break; case 3: // ueq, ngl (equal, unordered) - irOp = IROp::FCmpEqualUnordered; + mode = IRFpCompareMode::EqualUnordered; return; case 4: // olt, lt (less than, ordered) - irOp = IROp::FCmpLessOrdered; + mode = IRFpCompareMode::LessOrdered; break; case 5: // ult, nge (less than, unordered) - irOp = IROp::FCmpLessUnordered; + mode = IRFpCompareMode::LessUnordered; break; case 6: // ole, le (less equal, ordered) - irOp = IROp::FCmpLessEqualOrdered; + mode = IRFpCompareMode::LessEqualOrdered; break; case 7: // ule, ngt (less equal, unordered) - irOp = IROp::FCmpLessEqualUnordered; + mode = IRFpCompareMode::LessEqualUnordered; break; default: - Comp_Generic(op); + DISABLE; return; } - ir.Write(irOp, fs, ft); + ir.Write(IROp::FCmp, (int)mode, fs, ft); } void IRFrontend::Comp_FPU2op(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 1670deb967b2..3478114f8ffa 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -81,8 +81,8 @@ namespace MIPSComp { return true; } - static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) { - return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg; + static bool IsOverlapSafe(int dreg, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) { + return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs); } void IRFrontend::Comp_VPFX(MIPSOpcode op) { @@ -364,7 +364,35 @@ namespace MIPSComp { static const float MEMORY_ALIGNED16(vavg_table[4]) = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f }; void IRFrontend::Comp_Vhoriz(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[1]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, V_Single, _VD); + + // We have to start at +0.000 in case any values are -0.000. 
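// Illustrative aside (sketch, not from the patch): IEEE-754 signed zero is the
// reason for the +0.000 seed. A sum made only of -0.0f terms stays -0.0f, while
// seeding the accumulator with +0.0f makes the final sum come out as +0.0f.
#include <cmath>
#include <cstdio>
int main() {
	float allNeg = -0.0f + -0.0f;          // -0.0f: adding negative zeros keeps the sign
	float seeded = 0.0f + -0.0f + -0.0f;   // +0.0f: the positive seed wins
	printf("%d %d\n", (int)std::signbit(allNeg), (int)std::signbit(seeded));  // prints "1 0"
	return 0;
}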
+ ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(0.0f)); + for (int i = 0; i < n; ++i) { + ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, sregs[i]); + } + + switch ((op >> 16) & 31) { + case 6: // vfad + ir.Write(IROp::FMov, dregs[0], IRVTEMP_0); + break; + case 7: // vavg + ir.Write(IROp::SetConstF, IRVTEMP_0 + 1, ir.AddConstantFloat(vavg_table[n - 1])); + ir.Write(IROp::FMul, dregs[0], IRVTEMP_0, IRVTEMP_0 + 1); + break; + } + + ApplyPrefixD(dregs, V_Single); } void IRFrontend::Comp_VDot(MIPSOpcode op) { @@ -397,7 +425,6 @@ namespace MIPSComp { void IRFrontend::Comp_VecDo3(MIPSOpcode op) { CONDITIONAL_DISABLE; - if (js.HasUnknownPrefix()) DISABLE; @@ -449,10 +476,10 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, _VT); GetVectorRegsPrefixD(dregs, sz, _VD); - int tempregs[4]; + u8 tempregs[4]; bool usingTemps = false; for (int i = 0; i < n; i++) { - if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) { + if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) { tempregs[i] = IRVTEMP_0 + i; usingTemps = true; } else { @@ -460,7 +487,7 @@ namespace MIPSComp { } } - if (allowSIMD && sz == V_Quad && !usingTemps && IsConsecutive4(sregs) && IsConsecutive4(dregs) && IsConsecutive4(sregs)) { + if (allowSIMD && sz == V_Quad && !usingTemps && IsConsecutive4(dregs) && IsConsecutive4(sregs) && IsConsecutive4(tregs)) { IROp opFunc = IROp::Nop; bool symmetric = false; switch (op >> 26) { @@ -502,6 +529,8 @@ namespace MIPSComp { if (opFunc != IROp::Nop) { ir.Write(opFunc, dregs[0], sregs[0], tregs[0]); + } else { + DISABLE; } ApplyPrefixD(dregs, sz); return; @@ -558,6 +587,10 @@ namespace MIPSComp { void IRFrontend::Comp_VV2Op(MIPSOpcode op) { if (js.HasUnknownPrefix()) DISABLE; + if (!js.HasNoPrefix()) { + logBlocks = 1; + //DISABLE; // Something subtle is wrong. + } int vs = _VS; int vd = _VD; @@ -577,7 +610,7 @@ namespace MIPSComp { bool usingTemps = false; int tempregs[4]; for (int i = 0; i < n; ++i) { - if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + if (!IsOverlapSafe(dregs[i], n, sregs)) { usingTemps = true; tempregs[i] = IRVTEMP_0 + i; } else { @@ -686,7 +719,7 @@ namespace MIPSComp { int tempregs[4]; for (int i = 0; i < n; ++i) { - if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + if (!IsOverlapSafe(dregs[i], n, sregs)) { tempregs[i] = IRVTEMP_PFX_T + i; // Need IRVTEMP_0 for the scaling factor } else { tempregs[i] = dregs[i]; @@ -697,8 +730,11 @@ namespace MIPSComp { // TODO: Use the SCVTF with builtin scaling where possible. 
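// Illustrative sketch (not from the patch): the scalar math the FCvtSW + FMul
// pair below implements, with the 2^-imm scale precomputed into a float
// constant the way `mult` is. The imm parameter stands in for the VFPU
// instruction's 5-bit scale field; names here are assumptions for illustration.
static inline float Vi2fReference(int s, int imm) {
	return (float)s * (1.0f / (float)(1u << imm));   // d = s / 2^imm
}
// The SCVTF mentioned in the TODO is AArch64's fixed-point convert form
// (scvtf s0, w0, #imm), which folds the int-to-float convert and the 2^-imm
// scale into a single instruction.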
for (int i = 0; i < n; i++) { ir.Write(IROp::FCvtSW, tempregs[i], sregs[i]); - if (mult != 1.0f) + } + if (mult != 1.0f) { + for (int i = 0; i < n; i++) { ir.Write(IROp::FMul, tempregs[i], tempregs[i], IRVTEMP_0); + } } for (int i = 0; i < n; ++i) { @@ -718,6 +754,8 @@ namespace MIPSComp { } void IRFrontend::Comp_Mftv(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int imm = op & 0xFF; MIPSGPReg rt = _RT; switch ((op >> 21) & 0x1f) { @@ -727,7 +765,18 @@ namespace MIPSComp { if (imm < 128) { //R(rt) = VI(imm); ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]); } else { - DISABLE; + switch (imm - 128) { + case VFPU_CTRL_DPREFIX: + case VFPU_CTRL_SPREFIX: + case VFPU_CTRL_TPREFIX: + FlushPrefixV(); + break; + } + if (imm - 128 < 16) { + ir.Write(IROp::VfpuCtrlToReg, rt, imm - 128); + } else { + DISABLE; + } } } break; @@ -735,6 +784,8 @@ namespace MIPSComp { case 7: // mtv if (imm < 128) { ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[imm], rt); + } else if ((imm - 128) < 16) { + ir.Write(IROp::SetCtrlVFPU, imm - 128, rt); } else { DISABLE; } @@ -743,18 +794,105 @@ namespace MIPSComp { default: DISABLE; } + // This op is marked not to auto-eat prefix so we must do it manually. + EatPrefix(); } + // Good above + void IRFrontend::Comp_Vmfvc(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + + int vs = _VS; + int imm = op & 0xFF; + if (imm >= 128 && imm < 128 + VFPU_CTRL_MAX) { + //if (imm - 128 == VFPU_CTRL_CC) { + // gpr.MapReg(MIPS_REG_VFPUCC, 0); + // fp.FMOV(fpr.V(vs), gpr.R(MIPS_REG_VFPUCC)); + // } else { + ir.Write(IROp::VfpuCtrlToReg, IRTEMP_0, imm - 128); + ir.Write(IROp::FMovFromGPR, vfpuBase + voffset[vs], IRTEMP_0); + } } void IRFrontend::Comp_Vmtvc(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + int vs = _VS; + int imm = op & 0xFF; + if (imm >= 128 && imm < 128 + VFPU_CTRL_MAX) { + ir.Write(IROp::SetCtrlVFPUFReg, imm - 128, vfpuBase + voffset[vs]); + if (imm - 128 == VFPU_CTRL_SPREFIX) { + js.prefixSFlag = JitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_TPREFIX) { + js.prefixTFlag = JitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_DPREFIX) { + js.prefixDFlag = JitState::PREFIX_UNKNOWN; + } + } } void IRFrontend::Comp_Vmmov(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int vs = _VS; + int vd = _VD; + // This probably ignores prefixes for all sane intents and purposes. + if (vs == vd) { + // A lot of these no-op matrix moves in Wipeout... Just drop the instruction entirely. + return; + } + + MatrixSize sz = GetMtxSize(op); + if (sz != M_4x4) { + // logBlocks = true; + DISABLE; + } + int n = GetMatrixSide(sz); + + u8 sregs[16], dregs[16]; + GetMatrixRegs(sregs, sz, vs); + GetMatrixRegs(dregs, sz, vd); + + // Rough overlap check. 
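// Illustrative note on the overlap cases checked below (sketch): a row-by-row
// FMov/Vec4Mov expansion reads the source registers one row at a time, so a
// destination matrix that shares registers with the source can clobber rows it
// has not read yet (the register-file analogue of an overlapping memcpy).
// Disjoint ranges are fine, the pure no-op case is handled by the vs == vd
// early return above, and everything in between (OVERLAP_PARTIAL, or an
// in-place transpose) would need temporaries, hence the DISABLEs.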
+ switch (GetMatrixOverlap(vs, vd, sz)) { + case OVERLAP_EQUAL: + // In-place transpose + DISABLE; + case OVERLAP_PARTIAL: + DISABLE; + case OVERLAP_NONE: + default: + break; + } + if (IsMatrixTransposed(vd) == IsMatrixTransposed(vs) && sz == M_4x4) { + // Untranspose both matrices + if (IsMatrixTransposed(vd)) { + vd = TransposeMatrixReg(vd); + vs = TransposeMatrixReg(vs); + } + // Get the columns + u8 scols[4], dcols[4]; + GetMatrixColumns(vs, sz, scols); + GetMatrixColumns(vd, sz, dcols); + for (int i = 0; i < 4; i++) { + u8 svec[4], dvec[4]; + GetVectorRegs(svec, GetVectorSize(sz), scols[i]); + GetVectorRegs(dvec, GetVectorSize(sz), dcols[i]); + ir.Write(IROp::Vec4Mov, dvec[0], svec[0]); + } + return; + } + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + ir.Write(IROp::FMov, dregs[a * 4 + b], sregs[a * 4 + b]); + } + } + } + + void IRFrontend::Comp_Vmscl(MIPSOpcode op) { DISABLE; + + // TODO: Tricky, can transpose } void IRFrontend::Comp_VScl(MIPSOpcode op) { @@ -772,26 +910,27 @@ namespace MIPSComp { GetVectorRegsPrefixT(&treg, V_Single, _VT); GetVectorRegsPrefixD(dregs, sz, _VD); - if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { - // In this case, there's zero danger of overlap. - ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); - ApplyPrefixD(dregs, sz); - return; - } - + bool overlap = false; // For prefixes to work, we just have to ensure that none of the output registers spill // and that there's no overlap. int tempregs[4]; for (int i = 0; i < n; ++i) { - // for vscl, it's fine if dregs[i] = sregs[i] - if (dregs[i] != sregs[i] && !IsOverlapSafe(dregs[i], i, n, sregs)) { + // Conservative, can be improved + if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) { // Need to use temp regs tempregs[i] = IRVTEMP_0 + i; + overlap = true; } else { tempregs[i] = dregs[i]; } } + if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs) && !overlap) { + ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); + ApplyPrefixD(dregs, sz); + return; + } + for (int i = 0; i < n; i++) { ir.Write(IROp::FMul, tempregs[i], sregs[i], treg); } @@ -866,7 +1005,7 @@ namespace MIPSComp { } } else if (sz == M_4x4) { // Tekken 6 has a case here: MEE - logBlocks = 1; + // logBlocks = 1; } // Fallback. Expands a LOT @@ -883,10 +1022,6 @@ namespace MIPSComp { } } - void IRFrontend::Comp_Vmscl(MIPSOpcode op) { - DISABLE; - } - void IRFrontend::Comp_Vtfm(MIPSOpcode op) { CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) { @@ -926,7 +1061,6 @@ namespace MIPSComp { ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); ir.Write(IROp::Vec4Add, s0, s0, s1); } else { - logBlocks = 1; ir.Write(IROp::Vec4Add, s0, s0, sregs[i]); } } @@ -1085,7 +1219,40 @@ namespace MIPSComp { } void IRFrontend::Comp_Vocp(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[4]; + // Actually, not sure that this instruction accepts an S prefix. We don't apply it in the + // interpreter. But whatever. 
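// Scalar reference for the IR emitted below (sketch): vocp computes the
// "one's complement" of each lane, d[i] = 1.0f - s[i], before the D prefix is
// applied. Names are illustrative only.
static inline void VocpReference(const float *s, float *d, int n) {
	for (int i = 0; i < n; i++)
		d[i] = 1.0f - s[i];
}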
+ GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, sz, _VD); + + int tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], n, sregs)) { + tempregs[i] = IRVTEMP_PFX_T; // using IRTEMP0 for other things + } else { + tempregs[i] = dregs[i]; + } + } + + ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(1.0f)); + for (int i = 0; i < n; ++i) { + ir.Write(IROp::FSub, tempregs[i], IRVTEMP_0, sregs[i]); + } + for (int i = 0; i < n; ++i) { + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } + + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_ColorConv(MIPSOpcode op) { diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 3b13978b43bf..0a3fc0432909 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -34,7 +34,8 @@ namespace MIPSComp { IRFrontend::IRFrontend(bool startDefaultPrefix) { logBlocks = 0; dontLogBlocks = 0; - js.startDefaultPrefix = startDefaultPrefix; + js.startDefaultPrefix = true; + js.hasSetRounding = false; // js.currentRoundingFunc = convertS0ToSCRATCH1[0]; } diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h index 7a9a9196120d..8ffdf5c9047a 100644 --- a/Core/MIPS/IR/IRFrontend.h +++ b/Core/MIPS/IR/IRFrontend.h @@ -90,15 +90,15 @@ class IRFrontend : public MIPSFrontendInterface { void DoJit(u32 em_address, std::vector &instructions, std::vector &constants); + void EatPrefix() override { + js.EatPrefix(); + } + private: void RestoreRoundingMode(bool force = false); void ApplyRoundingMode(bool force = false); void UpdateRoundingMode(); - void EatPrefix() override { - js.EatPrefix(); - } - void FlushAll(); void FlushPrefixV(); diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 92494612201d..74bf24dc4a99 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -87,6 +87,7 @@ static const IRMeta irMeta[] = { { IROp::FFloor, "FFloor", "FF" }, { IROp::FCvtWS, "FCvtWS", "FF" }, { IROp::FCvtSW, "FCvtSW", "FF" }, + { IROp::FCmp, "FCmp", "mFF" }, { IROp::FSat0_1, "FSat(0 - 1)", "FF" }, { IROp::FSatMinus1_1, "FSat(-1 - 1)", "FF" }, { IROp::FMovFromGPR, "FMovFromGPR", "FG" }, @@ -94,6 +95,8 @@ static const IRMeta irMeta[] = { { IROp::FpCondToReg, "FpCondToReg", "G" }, { IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" }, { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, + { IROp::SetCtrlVFPUReg, "SetCtrlVFPUReg", "TC" }, + { IROp::SetCtrlVFPUFReg, "SetCtrlVFPUFReg", "TF" }, { IROp::Vec4Init, "Vec4Init", "Fv" }, { IROp::Vec4Shuffle, "Vec4Shuffle", "FFs" }, @@ -222,6 +225,9 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, const u32 *co case 'I': snprintf(buf, bufSize, "%02x", param); break; + case 'm': + snprintf(buf, bufSize, "%d", param); + break; case 'T': snprintf(buf, bufSize, "%s", vfpuCtrlNames[param]); break; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index c296afb96755..0393eb9d22d7 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -131,15 +131,7 @@ enum class IROp : u8 { VfpuCtrlToReg, ZeroFpCond, - FCmpUnordered, - FCmpEqual, - FCmpEqualUnordered, - FCmpLessOrdered, - FCmpLessUnordered, - FCmpLessEqualOrdered, - FCmpLessEqualUnordered, - FCmpEqualZero, - FCmpNotEqualZero, + FCmp, FCmovVfpuCC, @@ -149,6 +141,8 @@ enum class IROp : u8 { UpdateRoundingMode, SetCtrlVFPU, + SetCtrlVFPUReg, + SetCtrlVFPUFReg, // 4-wide instructions to assist SIMD. 
// Can of course add a pass to break them up if a target does not @@ -245,6 +239,17 @@ inline IROp ComparisonToExit(IRComparison comp) { } } +enum IRFpCompareMode { + False = 0, + NotEqualUnordered, + EqualOrdered, // eq, seq (equal, ordered) + EqualUnordered, // ueq, ngl (equal, unordered) + LessOrdered, // olt, lt (less than, ordered) + LessUnordered, // ult, nge (less than, unordered) + LessEqualOrdered, // ole, le (less equal, ordered) + LessEqualUnordered, // ule, ngt (less equal, unordered) +}; + enum { IRTEMP_0 = 192, IRTEMP_1, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 4e572745f0a5..63ac1afb99e1 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -421,6 +421,25 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FFloor: mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1]); break; + case IROp::FCmp: + switch (inst->dest) { + case IRFpCompareMode::False: + mips->fpcond = 0; + break; + case IRFpCompareMode::EqualOrdered: + case IRFpCompareMode::EqualUnordered: + mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2]; + break; + case IRFpCompareMode::LessEqualOrdered: + case IRFpCompareMode::LessEqualUnordered: + mips->fpcond = mips->f[inst->src1] <= mips->f[inst->src2]; + break; + case IRFpCompareMode::LessOrdered: + case IRFpCompareMode::LessUnordered: + mips->fpcond = mips->f[inst->src1] < mips->f[inst->src2]; + break; + } + break; case IROp::FCvtSW: mips->f[inst->dest] = (float)mips->fs[inst->src1]; @@ -529,6 +548,14 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c mips->vfpuCtrl[inst->dest] = constPool[inst->src1]; break; + case IROp::SetCtrlVFPUReg: + mips->vfpuCtrl[inst->dest] = mips->r[inst->src1]; + break; + + case IROp::SetCtrlVFPUFReg: + memcpy(&mips->vfpuCtrl[inst->dest], &mips->f[inst->src1], 4); + break; + default: Crash(); } diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 477774f4355a..d18c92761a00 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -379,13 +379,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::ZeroFpCond: - case IROp::FCmpUnordered: - case IROp::FCmpEqual: - case IROp::FCmpEqualUnordered: - case IROp::FCmpLessOrdered: - case IROp::FCmpLessUnordered: - case IROp::FCmpLessEqualOrdered: - case IROp::FCmpLessEqualUnordered: + case IROp::FCmp: gpr.MapDirty(IRREG_FPCOND); goto doDefault; From c69a8c07dc9b577c82c29e19dd4f62f791551954 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 20:20:59 +0200 Subject: [PATCH 53/77] Forgot this --- Core/MIPS/IR/IRCompVFPU.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 3478114f8ffa..8ca4336c2b1a 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -587,10 +587,6 @@ namespace MIPSComp { void IRFrontend::Comp_VV2Op(MIPSOpcode op) { if (js.HasUnknownPrefix()) DISABLE; - if (!js.HasNoPrefix()) { - logBlocks = 1; - //DISABLE; // Something subtle is wrong. 
- } int vs = _VS; int vd = _VD; From 1851458628fd5de7f76ebcaa823af499d997fe5e Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 20:28:59 +0200 Subject: [PATCH 54/77] Bugfixes --- Core/MIPS/ARM64/Arm64CompVFPU.cpp | 2 ++ Core/MIPS/IR/IRCompVFPU.cpp | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/ARM64/Arm64CompVFPU.cpp b/Core/MIPS/ARM64/Arm64CompVFPU.cpp index 91500507ae49..3cd5f03d7b90 100644 --- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp +++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp @@ -1297,6 +1297,8 @@ namespace MIPSComp { } void Arm64Jit::Comp_Vi2x(MIPSOpcode op) { + CONDITIONAL_DISABLE; + if (!cpu_info.bNEON) { DISABLE; } diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 8ca4336c2b1a..ed2ab8d89b54 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1127,26 +1127,41 @@ namespace MIPSComp { GetVectorRegs(tregs, sz, _VT); GetVectorRegs(dregs, sz, _VD); + int tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) { + tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things + } else { + tempregs[i] = dregs[i]; + } + } + if (sz == V_Triple) { int temp0 = IRVTEMP_0; int temp1 = IRVTEMP_0 + 1; // Compute X ir.Write(IROp::FMul, temp0, sregs[1], tregs[2]); ir.Write(IROp::FMul, temp1, sregs[2], tregs[1]); - ir.Write(IROp::FSub, dregs[0], temp0, temp1); + ir.Write(IROp::FSub, tempregs[0], temp0, temp1); // Compute Y ir.Write(IROp::FMul, temp0, sregs[2], tregs[0]); ir.Write(IROp::FMul, temp1, sregs[0], tregs[2]); - ir.Write(IROp::FSub, dregs[1], temp0, temp1); + ir.Write(IROp::FSub, tempregs[1], temp0, temp1); // Compute Z ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]); ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]); - ir.Write(IROp::FSub, dregs[2], temp0, temp1); + ir.Write(IROp::FSub, tempregs[2], temp0, temp1); } else if (sz == V_Quad) { DISABLE; } + + for (int i = 0; i < n; i++) { + if (tempregs[i] != dregs[i]) + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + // No D prefix supported } void IRFrontend::Comp_Vcmp(MIPSOpcode op) { @@ -1232,7 +1247,7 @@ namespace MIPSComp { int tempregs[4]; for (int i = 0; i < n; ++i) { if (!IsOverlapSafe(dregs[i], n, sregs)) { - tempregs[i] = IRVTEMP_PFX_T; // using IRTEMP0 for other things + tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things } else { tempregs[i] = dregs[i]; } From 7268abec611ba76a4ad218ecbcb537068fc0aa93 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Thu, 12 May 2016 22:35:31 +0200 Subject: [PATCH 55/77] IR: vcmp, vcmov, vhdp --- Core/MIPS/IR/IRCompVFPU.cpp | 93 +++++++++++++++++++++++++++++++--- Core/MIPS/IR/IRInst.cpp | 4 +- Core/MIPS/IR/IRInst.h | 8 +++ Core/MIPS/IR/IRInterpreter.cpp | 46 +++++++++++++++++ 4 files changed, 144 insertions(+), 7 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index ed2ab8d89b54..f4736be521ee 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -358,7 +358,38 @@ namespace MIPSComp { } void IRFrontend::Comp_VHdp(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + int vd = _VD; + int vs = _VS; + int vt = _VT; + VectorSize sz = GetVecSize(op); + + // TODO: Force read one of them into regs? probably not. + u8 sregs[4], tregs[4], dregs[1]; + GetVectorRegsPrefixS(sregs, sz, vs); + GetVectorRegsPrefixT(tregs, sz, vt); + GetVectorRegsPrefixD(dregs, V_Single, vd); + + // TODO: applyprefixST here somehow (shuffle, etc...) 
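// Scalar reference for the homogeneous dot product emitted below (sketch,
// assuming n >= 2): the last lane of S acts as an implicit 1.0, so T's last
// lane is added without a multiply.
static inline float VhdpReference(const float *s, const float *t, int n) {
	float sum = 0.0f;
	for (int i = 0; i < n - 1; i++)
		sum += s[i] * t[i];
	return sum + t[n - 1];
}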
+ ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]); + + int n = GetNumVectorElements(sz); + for (int i = 1; i < n; i++) { + // sum += s[i]*t[i]; + if (i == n - 1) { + ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, tregs[i]); + } else { + ir.Write(IROp::FMul, IRVTEMP_0 + 1, sregs[i], tregs[i]); + ir.Write(IROp::FAdd, IRVTEMP_0, IRVTEMP_0, IRVTEMP_0 + 1); + } + } + + ir.Write(IROp::FMov, dregs[0], IRVTEMP_0); + ApplyPrefixD(dregs, V_Single); } static const float MEMORY_ALIGNED16(vavg_table[4]) = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f }; @@ -840,7 +871,6 @@ namespace MIPSComp { MatrixSize sz = GetMtxSize(op); if (sz != M_4x4) { - // logBlocks = true; DISABLE; } int n = GetMatrixSide(sz); @@ -1165,16 +1195,66 @@ namespace MIPSComp { } void IRFrontend::Comp_Vcmp(MIPSOpcode op) { - // Fiendishly hard... - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + VCondition cond = (VCondition)(op & 0xF); + + u8 sregs[4], tregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixT(tregs, sz, _VT); + + int mask = 0; + for (int i = 0; i < n; i++) { + ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]); + mask |= (1 << i); + } + ir.Write(IROp::FCmpVfpuAggregate, mask); } void IRFrontend::Comp_Vcmov(MIPSOpcode op) { - // Fiendishly hard... - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) { + DISABLE; + } + + logBlocks = 1; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, sz, _VD); + int tf = (op >> 19) & 1; + int imm3 = (op >> 16) & 7; + + for (int i = 0; i < n; ++i) { + // Simplification: Disable if overlap unsafe + if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) { + DISABLE; + } + } + if (imm3 < 6) { + // Test one bit of CC. This bit decides whether none or all subregisters are copied. + for (int i = 0; i < n; i++) { + ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (imm3) | ((!tf) << 7)); + } + } else { + // Look at the bottom four bits of CC to individually decide if the subregisters should be copied. 
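// Packing of the third FCmovVfpuCC operand used here, as decoded by the
// interpreter later in this series (descriptive note): bits 0-6 select which
// VFPU_CTRL_CC bit to test (imm3 above, or the lane index i below), and bit 7,
// set from (!tf), is the value that CC bit must have for the copy to happen:
//   ccBit   = operand & 0x7f;
//   wantSet = operand >> 7;
//   if (((vfpuCtrl[VFPU_CTRL_CC] >> ccBit) & 1) == wantSet) dst = src;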
+ for (int i = 0; i < n; i++) { + ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7)); + } + } + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_Viim(MIPSOpcode op) { + CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) DISABLE; @@ -1186,6 +1266,7 @@ namespace MIPSComp { } void IRFrontend::Comp_Vfim(MIPSOpcode op) { + CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) DISABLE; diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 74bf24dc4a99..38b8f54e2d2a 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -97,7 +97,9 @@ static const IRMeta irMeta[] = { { IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" }, { IROp::SetCtrlVFPUReg, "SetCtrlVFPUReg", "TC" }, { IROp::SetCtrlVFPUFReg, "SetCtrlVFPUFReg", "TF" }, - + { IROp::FCmovVfpuCC, "FCmovVfpuCC", "FFI" }, + { IROp::FCmpVfpuBit, "FCmpVfpuBit", "IFF" }, + { IROp::FCmpVfpuAggregate, "FCmpVfpuAggregate", ""}, { IROp::Vec4Init, "Vec4Init", "Fv" }, { IROp::Vec4Shuffle, "Vec4Shuffle", "FFs" }, { IROp::Vec4Mov, "Vec4Mov", "FF" }, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 0393eb9d22d7..df434eb113ea 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -134,6 +134,8 @@ enum class IROp : u8 { FCmp, FCmovVfpuCC, + FCmpVfpuBit, + FCmpVfpuAggregate, // Rounding Mode RestoreRoundingMode, @@ -157,6 +159,12 @@ enum class IROp : u8 { Vec4Scale, Vec4Dot, + // vx2i + Vec4ExpandU16ToU32Hi, + Vec4ExpandU8ToU32Hi, + Vec4ExpandS16ToS32Hi, + Vec4ExpandS8ToS32Hi, + // Slow special functions. Used on singles. FSin, FCos, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 63ac1afb99e1..1a719046cdff 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -32,6 +32,9 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c const IRInst *end = inst + count; while (inst != end) { switch (inst->op) { + case IROp::Nop: + _assert_(false); + break; case IROp::SetConst: mips->r[inst->dest] = constPool[inst->src1]; break; @@ -209,6 +212,49 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c #endif break; + case IROp::FCmpVfpuBit: + { + int op = inst->dest & 0xF; + int bit = inst->dest >> 4; + int result = 0; + switch (op) { + case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break; + case VC_NE: result = mips->f[inst->src1] != mips->f[inst->src2]; break; + case VC_LT: result = mips->f[inst->src1] < mips->f[inst->src2]; break; + case VC_LE: result = mips->f[inst->src1] <= mips->f[inst->src2]; break; + case VC_GT: result = mips->f[inst->src1] > mips->f[inst->src2]; break; + case VC_GE: result = mips->f[inst->src1] >= mips->f[inst->src2]; break; + case VC_EZ: result = mips->f[inst->src1] == 0.0f; break; + case VC_NZ: result = mips->f[inst->src1] != 0.0f; break; + case VC_TR: result = 1; break; + case VC_FL: result = 0; break; + default: + result = 0; + } + if (result != 0) { + mips->vfpuCtrl[VFPU_CTRL_CC] |= (1 << bit); + } else { + mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit); + } + } + break; + + case IROp::FCmpVfpuAggregate: + { + int mask = inst->dest; + u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; + int a = (cc & mask) ? 0x10 : 0x00; + int b = (cc & mask) == mask ? 0x20 : 0x00; + mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b;; + } + break; + + case IROp::FCmovVfpuCC: + if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0x7f)) & 1) == (inst->src2 >> 7)) { + mips->f[inst->dest] = mips->f[inst->src1]; + } + break; + // Not quickly implementable on all platforms, unfortunately. 
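// The Vec4Dot case continues in the surrounding context; for reference, the
// scalar result it has to produce is a plain 4-lane dot product (sketch,
// ignoring the SSE/NEON fast paths):
static inline float Vec4DotReference(const float *a, const float *b) {
	float sum = 0.0f;
	for (int i = 0; i < 4; i++)
		sum += a[i] * b[i];
	return sum;
}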
case IROp::Vec4Dot: { From 99468c6fc157c173a537b53d9d514f8617460d94 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 9 May 2016 21:40:46 -0700 Subject: [PATCH 56/77] jit-ir: Optimize out unused temp regs. This way, if constants have made the temp obsolete (common with ins, for example), it won't even get set anymore. --- Core/MIPS/IR/IRFrontend.cpp | 1 + Core/MIPS/IR/IRInst.cpp | 10 ++--- Core/MIPS/IR/IRInst.h | 4 ++ Core/MIPS/IR/IRPassSimplify.cpp | 76 +++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 1 + 5 files changed, 87 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 0a3fc0432909..3e6c0e755d6f 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -233,6 +233,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v if (true) { static const IRPassFunc passes[] = { &PropagateConstants, + &PurgeTemps, }; if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) logBlocks = 1; diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 38b8f54e2d2a..b95d5fafa3fd 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -60,11 +60,11 @@ static const IRMeta irMeta[] = { { IROp::Load32, "Load32", "GGC" }, { IROp::LoadFloat, "LoadFloat", "FGC" }, { IROp::LoadVec4, "LoadVec4", "FGC" }, - { IROp::Store8, "Store8", "GGC" }, - { IROp::Store16, "Store16", "GGC" }, - { IROp::Store32, "Store32", "GGC" }, - { IROp::StoreFloat, "StoreFloat", "FGC" }, - { IROp::StoreVec4, "StoreVec4", "FGC" }, + { IROp::Store8, "Store8", "GGC", IRFLAG_SRC3 }, + { IROp::Store16, "Store16", "GGC", IRFLAG_SRC3 }, + { IROp::Store32, "Store32", "GGC", IRFLAG_SRC3 }, + { IROp::StoreFloat, "StoreFloat", "FGC", IRFLAG_SRC3 }, + { IROp::StoreVec4, "StoreVec4", "FGC", IRFLAG_SRC3 }, { IROp::FAdd, "FAdd", "FFF" }, { IROp::FSub, "FSub", "FFF" }, { IROp::FMul, "FMul", "FFF" }, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index df434eb113ea..b4e330aadc6a 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -281,6 +281,10 @@ enum { IRREG_FPCOND = 245, }; +enum IRFlags { + IRFLAG_SRC3 = 1, +}; + struct IRMeta { IROp op; const char *name; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index d18c92761a00..abb06fe89c2f 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -433,3 +433,79 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { } return logBlocks; } + +bool IRReadsFromGPR(const IRInst &inst, int reg) { + const IRMeta *m = GetIRMeta(inst.op); + + if (m->types[1] == 'G' && inst.src1 == reg) { + return true; + } + if (m->types[2] == 'G' && inst.src2 == reg) { + return true; + } + if ((m->flags & IRFLAG_SRC3) != 0 && m->types[0] == 'G' && inst.src3 == reg) { + return true; + } + if (inst.op == IROp::Interpret) { + return true; + } + return false; +} + +int IRDestGPR(const IRInst &inst) { + const IRMeta *m = GetIRMeta(inst.op); + + if ((m->flags & IRFLAG_SRC3) == 0 && m->types[0] == 'G') { + return inst.dest; + } + return -1; +} + +bool PurgeTemps(const IRWriter &in, IRWriter &out) { + IRRegCache gpr(&out); + + for (u32 value : in.GetConstants()) { + out.AddConstant(value); + } + + bool logBlocks = false; + for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { + const IRInst &inst = in.GetInstructions()[i]; + + int dest = IRDestGPR(inst); + bool read = true; + switch (dest) { + case IRTEMP_0: + case IRTEMP_1: + case IRTEMP_LHS: + case IRTEMP_RHS: + // Unlike 
other ops, these don't need to persist between blocks. + // So we consider them not read unless proven read. + read = false; + for (int j = i + 1; j < n; j++) { + const IRInst &laterInst = in.GetInstructions()[j]; + if (IRReadsFromGPR(laterInst, dest)) { + // Read from, so we can't optimize out. + read = true; + break; + } + if (IRDestGPR(laterInst) == dest) { + // Clobbered, we can optimize out. + break; + } + } + break; + + default: + break; + } + + // TODO: VFPU temps? + + if (read) { + out.Write(inst); + } + } + + return logBlocks; +} diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index 72e87ace2150..496f4e6aad86 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -6,3 +6,4 @@ typedef bool (*IRPassFunc)(const IRWriter &in, IRWriter &out); bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out); bool PropagateConstants(const IRWriter &in, IRWriter &out); +bool PurgeTemps(const IRWriter &in, IRWriter &out); From d06c6c080cf9b7d53aba958efc30681722eba42d Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 9 May 2016 22:25:35 -0700 Subject: [PATCH 57/77] jit-ir: Expand unused regs to regular GPRs. --- Core/MIPS/IR/IRInst.cpp | 26 +++++++++++++------------- Core/MIPS/IR/IRInst.h | 7 ++++++- Core/MIPS/IR/IRPassSimplify.cpp | 24 +++++++++++++++++++++--- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index b95d5fafa3fd..11db5bf1bf53 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -33,8 +33,8 @@ static const IRMeta irMeta[] = { { IROp::SltU, "SltU", "GGG" }, { IROp::SltUConst, "SltUConst", "GGC" }, { IROp::Clz, "Clz", "GG" }, - { IROp::MovZ, "MovZ", "GGG" }, - { IROp::MovNZ, "MovNZ", "GGG" }, + { IROp::MovZ, "MovZ", "GGG", IRFLAG_SRC3DST }, + { IROp::MovNZ, "MovNZ", "GGG", IRFLAG_SRC3DST }, { IROp::Max, "Max", "GGG" }, { IROp::Min, "Min", "GGG" }, { IROp::BSwap16, "BSwap16", "GG" }, @@ -112,19 +112,19 @@ static const IRMeta irMeta[] = { { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, - { IROp::ExitToConst, "Exit", "C" }, - { IROp::ExitToConstIfEq, "ExitIfEq", "CGG" }, - { IROp::ExitToConstIfNeq, "ExitIfNeq", "CGG" }, - { IROp::ExitToConstIfGtZ, "ExitIfGtZ", "CG" }, - { IROp::ExitToConstIfGeZ, "ExitIfGeZ", "CG" }, - { IROp::ExitToConstIfLeZ, "ExitIfLeZ", "CG" }, - { IROp::ExitToConstIfLtZ, "ExitIfLtZ", "CG" }, - { IROp::ExitToReg, "ExitToReg", "G" }, - { IROp::Syscall, "Syscall", "_C" }, - { IROp::Break, "Break", ""}, + { IROp::ExitToConst, "Exit", "C", IRFLAG_EXIT }, + { IROp::ExitToConstIfEq, "ExitIfEq", "CGG", IRFLAG_EXIT }, + { IROp::ExitToConstIfNeq, "ExitIfNeq", "CGG", IRFLAG_EXIT }, + { IROp::ExitToConstIfGtZ, "ExitIfGtZ", "CG", IRFLAG_EXIT }, + { IROp::ExitToConstIfGeZ, "ExitIfGeZ", "CG", IRFLAG_EXIT }, + { IROp::ExitToConstIfLeZ, "ExitIfLeZ", "CG", IRFLAG_EXIT }, + { IROp::ExitToConstIfLtZ, "ExitIfLtZ", "CG", IRFLAG_EXIT }, + { IROp::ExitToReg, "ExitToReg", "G", IRFLAG_EXIT | IRFLAG_SRC3 }, + { IROp::Syscall, "Syscall", "_C", IRFLAG_EXIT }, + { IROp::Break, "Break", "", IRFLAG_EXIT}, { IROp::SetPC, "SetPC", "_G" }, { IROp::SetPCConst, "SetPC", "_C" }, - { IROp::CallReplacement, "CallRepl", "_C"}, + { IROp::CallReplacement, "CallRepl", "_C" }, }; const IRMeta *metaIndex[256]; diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index b4e330aadc6a..d2195892ba82 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -282,7 +282,12 @@ enum { }; enum 
IRFlags { - IRFLAG_SRC3 = 1, + // Uses src3, not dest. + IRFLAG_SRC3 = 0x0001, + // Uses src3 AND dest (i.e. mutates dest.) + IRFLAG_SRC3DST = 0x0002, + // Exit instruction (maybe conditional.) + IRFLAG_EXIT = 0x0004, }; struct IRMeta { diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index abb06fe89c2f..f33afef6acc7 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -443,10 +443,10 @@ bool IRReadsFromGPR(const IRInst &inst, int reg) { if (m->types[2] == 'G' && inst.src2 == reg) { return true; } - if ((m->flags & IRFLAG_SRC3) != 0 && m->types[0] == 'G' && inst.src3 == reg) { + if ((m->flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && m->types[0] == 'G' && inst.src3 == reg) { return true; } - if (inst.op == IROp::Interpret) { + if (inst.op == IROp::Interpret || inst.op == IROp::CallReplacement) { return true; } return false; @@ -474,6 +474,7 @@ bool PurgeTemps(const IRWriter &in, IRWriter &out) { int dest = IRDestGPR(inst); bool read = true; + bool readByExit = true; switch (dest) { case IRTEMP_0: case IRTEMP_1: @@ -482,21 +483,38 @@ bool PurgeTemps(const IRWriter &in, IRWriter &out) { // Unlike other ops, these don't need to persist between blocks. // So we consider them not read unless proven read. read = false; + readByExit = false; + // Intentional fall-through. + + default: + if (dest > IRTEMP_RHS) { + // These might sometimes be implicitly read/written by other instructions. + break; + } for (int j = i + 1; j < n; j++) { const IRInst &laterInst = in.GetInstructions()[j]; + const IRMeta *m = GetIRMeta(laterInst.op); if (IRReadsFromGPR(laterInst, dest)) { // Read from, so we can't optimize out. read = true; break; } + if (readByExit && (m->flags & IRFLAG_EXIT) != 0) { + read = true; + break; + } + if (IRDestGPR(laterInst) == dest) { // Clobbered, we can optimize out. + // This happens sometimes with temporaries used for constant addresses. + read = false; break; } } break; - default: + // Not a GPR output. + case -1: break; } From 9f183c97ba53c392ea1fd4ac7bb43fad8dd2584b Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 18:27:52 -0700 Subject: [PATCH 58/77] jit-ir: Prevent reading ahead for each reg write. --- Core/MIPS/IR/IRPassSimplify.cpp | 131 +++++++++++++++++++++++++------- Core/MIPS/IR/IRPassSimplify.h | 1 + 2 files changed, 106 insertions(+), 26 deletions(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index f33afef6acc7..cdb766eeceaa 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -462,19 +462,45 @@ int IRDestGPR(const IRInst &inst) { } bool PurgeTemps(const IRWriter &in, IRWriter &out) { - IRRegCache gpr(&out); + std::vector insts; + insts.reserve(in.GetInstructions().size()); - for (u32 value : in.GetConstants()) { - out.AddConstant(value); - } + struct Check { + Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) { + } + + int reg; + int index; + bool readByExit; + }; + std::vector checks; bool logBlocks = false; for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { const IRInst &inst = in.GetInstructions()[i]; + const IRMeta *m = GetIRMeta(inst.op); + + for (Check &check : checks) { + if (check.reg == 0) { + continue; + } + + if (IRReadsFromGPR(inst, check.reg)) { + // Read from, so we can't optimize out. + check.reg = 0; + } else if (check.readByExit && (m->flags & IRFLAG_EXIT) != 0) { + check.reg = 0; + } else if (IRDestGPR(inst) == check.reg) { + // Clobbered, we can optimize out. 
+ // This happens sometimes with temporaries used for constant addresses. + insts[check.index].op = IROp::Mov; + insts[check.index].dest = 0; + insts[check.index].src1 = 0; + check.reg = 0; + } + } int dest = IRDestGPR(inst); - bool read = true; - bool readByExit = true; switch (dest) { case IRTEMP_0: case IRTEMP_1: @@ -482,45 +508,98 @@ bool PurgeTemps(const IRWriter &in, IRWriter &out) { case IRTEMP_RHS: // Unlike other ops, these don't need to persist between blocks. // So we consider them not read unless proven read. - read = false; - readByExit = false; - // Intentional fall-through. + checks.push_back(Check(dest, i, false)); + break; default: if (dest > IRTEMP_RHS) { // These might sometimes be implicitly read/written by other instructions. break; } + checks.push_back(Check(dest, i, true)); + break; + + // Not a GPR output. + case 0: + case -1: + break; + } + + // TODO: VFPU temps? + + insts.push_back(inst); + } + + for (Check &check : checks) { + if (!check.readByExit && check.reg > 0) { + insts[check.index].op = IROp::Mov; + insts[check.index].dest = 0; + insts[check.index].src1 = 0; + } + } + + for (u32 value : in.GetConstants()) { + out.AddConstant(value); + } + for (const IRInst &inst : insts) { + if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) { + out.Write(inst); + } + } + + return logBlocks; +} + +bool ReduceLoads(const IRWriter &in, IRWriter &out) { + for (u32 value : in.GetConstants()) { + out.AddConstant(value); + } + + // This tells us to skip an AND op that has been optimized out. + // Maybe we could skip multiple, but that'd slow things down and is pretty uncommon. + int nextSkip = -1; + + bool logBlocks = false; + for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { + IRInst inst = in.GetInstructions()[i]; + + if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) { + int dest = IRDestGPR(inst); for (int j = i + 1; j < n; j++) { const IRInst &laterInst = in.GetInstructions()[j]; const IRMeta *m = GetIRMeta(laterInst.op); - if (IRReadsFromGPR(laterInst, dest)) { - // Read from, so we can't optimize out. - read = true; + + if ((m->flags & IRFLAG_EXIT) != 0) { + // Exit, so we can't do the optimization. break; } - if (readByExit && (m->flags & IRFLAG_EXIT) != 0) { - read = true; + if (IRReadsFromGPR(laterInst, dest)) { + if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) { + const u32 mask = in.GetConstants()[laterInst.src2]; + // Here we are, maybe we can reduce the load size based on the mask. + if ((mask & 0xffffff00) == 0) { + inst.op = IROp::Load8; + if (mask == 0xff) { + nextSkip = j; + } + } else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) { + inst.op = IROp::Load16; + if (mask == 0xffff) { + nextSkip = j; + } + } + } + // If it was read, we can't do the optimization. break; } - if (IRDestGPR(laterInst) == dest) { - // Clobbered, we can optimize out. - // This happens sometimes with temporaries used for constant addresses. - read = false; + // Someone else wrote, so we can't do the optimization. break; } } - break; - - // Not a GPR output. - case -1: - break; } - // TODO: VFPU temps? 
- - if (read) { + if (i != nextSkip) { out.Write(inst); } } diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index 496f4e6aad86..d6dd041259c2 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -7,3 +7,4 @@ bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWri bool PropagateConstants(const IRWriter &in, IRWriter &out); bool PurgeTemps(const IRWriter &in, IRWriter &out); +bool ReduceLoads(const IRWriter &in, IRWriter &out); From 29ed8d22012342f0dc1c57190bf4c17447e18537 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 18:34:27 -0700 Subject: [PATCH 59/77] jit-ir: ExitToReg doesn't write to registers. --- Core/MIPS/IR/IRCompBranch.cpp | 2 +- Core/MIPS/IR/IRFrontend.cpp | 2 +- Core/MIPS/IR/IRInst.cpp | 2 +- Core/MIPS/IR/IRInterpreter.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 76833bf32906..3dd14cef5152 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -363,7 +363,7 @@ void IRFrontend::Comp_JumpReg(MIPSOpcode op) { break; } - ir.Write(IROp::ExitToReg, destReg, 0, 0); + ir.Write(IROp::ExitToReg, 0, destReg, 0); js.compiling = false; } diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 3e6c0e755d6f..5c154c7df980 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -143,7 +143,7 @@ void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { } else { ApplyRoundingMode(); ir.Write(IROp::Downcount, 0, js.downcountAmount & 0xFF, js.downcountAmount >> 8); - ir.Write(IROp::ExitToReg, MIPS_REG_RA, 0, 0); + ir.Write(IROp::ExitToReg, 0, MIPS_REG_RA, 0); js.compiling = false; } } else { diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 11db5bf1bf53..983a4e6a3a12 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -119,7 +119,7 @@ static const IRMeta irMeta[] = { { IROp::ExitToConstIfGeZ, "ExitIfGeZ", "CG", IRFLAG_EXIT }, { IROp::ExitToConstIfLeZ, "ExitIfLeZ", "CG", IRFLAG_EXIT }, { IROp::ExitToConstIfLtZ, "ExitIfLtZ", "CG", IRFLAG_EXIT }, - { IROp::ExitToReg, "ExitToReg", "G", IRFLAG_EXIT | IRFLAG_SRC3 }, + { IROp::ExitToReg, "ExitToReg", "_G", IRFLAG_EXIT }, { IROp::Syscall, "Syscall", "_C", IRFLAG_EXIT }, { IROp::Break, "Break", "", IRFLAG_EXIT}, { IROp::SetPC, "SetPC", "_G" }, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 1a719046cdff..cd5ea342831c 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -523,7 +523,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c return constPool[inst->dest]; case IROp::ExitToReg: - return mips->r[inst->dest]; + return mips->r[inst->src1]; case IROp::ExitToConstIfEq: if (mips->r[inst->src1] == mips->r[inst->src2]) From a8126ca1321e82ebf0aa2db358161e8cf0febd25 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 20:56:47 -0700 Subject: [PATCH 60/77] jit-ir: Add some missing CONDITIONAL_DISABLEs. 
--- Core/MIPS/IR/IRCompVFPU.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index f4736be521ee..57433a4d11dc 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -223,6 +223,7 @@ namespace MIPSComp { } void IRFrontend::Comp_SV(MIPSOpcode op) { + CONDITIONAL_DISABLE; s32 offset = (signed short)(op & 0xFFFC); int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5); MIPSGPReg rs = _RS; @@ -241,6 +242,7 @@ namespace MIPSComp { } void IRFrontend::Comp_SVQ(MIPSOpcode op) { + CONDITIONAL_DISABLE; int imm = (signed short)(op & 0xFFFC); int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5); MIPSGPReg rs = _RS; @@ -280,6 +282,7 @@ namespace MIPSComp { } void IRFrontend::Comp_VVectorInit(MIPSOpcode op) { + CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) { DISABLE; } @@ -300,6 +303,7 @@ namespace MIPSComp { } void IRFrontend::Comp_VIdt(MIPSOpcode op) { + CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) { DISABLE; } @@ -320,6 +324,7 @@ namespace MIPSComp { } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { + CONDITIONAL_DISABLE; MatrixSize sz = GetMtxSize(op); if (sz != M_4x4) { DISABLE; @@ -616,6 +621,7 @@ namespace MIPSComp { } void IRFrontend::Comp_VV2Op(MIPSOpcode op) { + CONDITIONAL_DISABLE; if (js.HasUnknownPrefix()) DISABLE; From 57b3dbff7e800550517f24fbf44fe511a42263e9 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 20:57:10 -0700 Subject: [PATCH 61/77] jit-ir: Avoid flushing on a few Vec4 ops. --- Core/MIPS/IR/IRCompVFPU.cpp | 2 +- Core/MIPS/IR/IRPassSimplify.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 57433a4d11dc..6caefc6e290b 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -85,7 +85,7 @@ namespace MIPSComp { return IsOverlapSafeAllowS(dreg, -1, sn, sregs, tn, tregs); } - void IRFrontend::Comp_VPFX(MIPSOpcode op) { + void IRFrontend::Comp_VPFX(MIPSOpcode op) { CONDITIONAL_DISABLE; int data = op & 0xFFFFF; int regnum = (op >> 24) & 3; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index cdb766eeceaa..d604f6531475 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -369,10 +369,12 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { break; case IROp::Vec4Init: + case IROp::Vec4Mov: case IROp::Vec4Add: case IROp::Vec4Sub: case IROp::Vec4Mul: case IROp::Vec4Div: + case IROp::Vec4Dot: case IROp::Vec4Scale: case IROp::Vec4Shuffle: out.Write(inst); @@ -392,6 +394,8 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1); goto doDefault; + case IROp::CallReplacement: + case IROp::Break: case IROp::Syscall: case IROp::Interpret: case IROp::ExitToConst: From 1ddb2fbfb99f73cf8363a15ca018ff9993961235 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 21:02:56 -0700 Subject: [PATCH 62/77] jit-ir: Fix non-SSE Vec4Scale. 
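(Illustrative note, not part of the original commit message: Vec4Scale multiplies a
four-lane vector by one scalar, so the portable fallback must not index the scalar
operand per lane. The intended scalar form is roughly

    for (int i = 0; i < 4; i++)
        mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2];

whereas the pre-fix loop read f[src2 + i] and silently pulled in the three registers
that happen to follow the scale factor.)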
--- Core/MIPS/IR/IRInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index cd5ea342831c..4141bc66da6e 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -208,7 +208,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2]))); #else for (int i = 0; i < 4; i++) - mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; + mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2]; #endif break; From c11c0465decc8f5e4bb26ce9ce478dc7a27f2de7 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 21:25:57 -0700 Subject: [PATCH 63/77] jir-ir: Correct vftm SIMD regs. --- Core/MIPS/IR/IRCompVFPU.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 6caefc6e290b..3d22a608d3e5 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1082,25 +1082,25 @@ namespace MIPSComp { GetVectorRegs(tregs, sz, _VT); GetVectorRegs(dregs, sz, _VD); - // SIMD-optimized implementations - if (msz == M_4x4 && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { + // SIMD-optimized implementations - if sregs[0..3] is consecutive, the rest are too. + if (msz == M_4x4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; - if (!IsConsecutive4(sregs)) { + if (!IsConsecutive4(tregs)) { ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); for (int i = 1; i < 4; i++) { if (!homogenous || (i != n - 1)) { - ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); + ir.Write(IROp::Vec4Scale, s1, sregs[i * 4], tregs[i]); ir.Write(IROp::Vec4Add, s0, s0, s1); } else { - ir.Write(IROp::Vec4Add, s0, s0, sregs[i]); + ir.Write(IROp::Vec4Add, s0, s0, sregs[i * 4]); } } ir.Write(IROp::Vec4Mov, dregs[0], s0); return; } else if (!homogenous) { for (int i = 0; i < 4; i++) { - ir.Write(IROp::Vec4Dot, s0 + i, sregs[i], tregs[0]); + ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[0]); } ir.Write(IROp::Vec4Mov, dregs[0], s0); return; From f52120353b6ad4fc9b03bf79b6958f2749032f15 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 22:29:31 -0700 Subject: [PATCH 64/77] jit-ir: Apply prefixes for vector init ops. Without this, Gods Eater Burst is horribly broken. --- Core/MIPS/IR/IRCompVFPU.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 3d22a608d3e5..9d2dc7c1fb5c 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -292,14 +292,16 @@ namespace MIPSComp { int vd = _VD; int n = GetNumVectorElements(sz); u8 dregs[4]; - GetVectorRegs(dregs, sz, vd); - if (sz == 4 && IsVectorColumn(vd)) { + GetVectorRegsPrefixD(dregs, sz, vd); + + if (sz == V_Quad && IsConsecutive4(dregs)) { ir.Write(IROp::Vec4Init, dregs[0], (int)(type == 6 ? Vec4Init::AllZERO : Vec4Init::AllONE)); } else { for (int i = 0; i < n; i++) { ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 
0.0f : 1.0f)); } } + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_VIdt(MIPSOpcode op) { @@ -313,14 +315,16 @@ namespace MIPSComp { if (sz != V_Quad) DISABLE; - if (!IsVectorColumn(vd)) - DISABLE; - u8 dregs[4]; - GetVectorRegs(dregs, sz, vd); + GetVectorRegsPrefixD(dregs, sz, vd); + if (!IsConsecutive4(dregs)) { + DISABLE; + } int row = vd & 3; + // Might not be consecutive if masked. Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); ir.Write(IROp::Vec4Init, dregs[0], (int)init); + ApplyPrefixD(dregs, sz); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { @@ -794,7 +798,7 @@ namespace MIPSComp { switch ((op >> 21) & 0x1f) { case 3: //mfv / mfvc // rt = 0, imm = 255 appears to be used as a CPU interlock by some games. - if (rt != 0) { + if (rt != MIPS_REG_ZERO) { if (imm < 128) { //R(rt) = VI(imm); ir.Write(IROp::FMovToGPR, rt, vfpuBase + voffset[imm]); } else { From 9e3572dc63b76c7eb06ad02efeb7e86786667e2a Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 22:38:56 -0700 Subject: [PATCH 65/77] jit-ir: Improve vidt to handle more cases. --- Core/MIPS/IR/IRCompVFPU.cpp | 30 +++++++++++++++++++++--------- Core/MIPS/IR/IRPassSimplify.cpp | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 9d2dc7c1fb5c..ce41c19712ec 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -312,18 +312,30 @@ namespace MIPSComp { int vd = _VD; VectorSize sz = GetVecSize(op); - if (sz != V_Quad) - DISABLE; - u8 dregs[4]; GetVectorRegsPrefixD(dregs, sz, vd); - if (!IsConsecutive4(dregs)) { - DISABLE; + + if (sz == 4 && IsConsecutive4(dregs)) { + int row = vd & 3; + Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); + ir.Write(IROp::Vec4Init, dregs[0], (int)init); + } else { + switch (sz) { + case V_Pair: + ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 1) == 0 ? 1.0f : 0.0f)); + ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 1) == 1 ? 1.0f : 0.0f)); + break; + case V_Quad: + ir.Write(IROp::SetConstF, dregs[0], ir.AddConstantFloat((vd & 3) == 0 ? 1.0f : 0.0f)); + ir.Write(IROp::SetConstF, dregs[1], ir.AddConstantFloat((vd & 3) == 1 ? 1.0f : 0.0f)); + ir.Write(IROp::SetConstF, dregs[2], ir.AddConstantFloat((vd & 3) == 2 ? 1.0f : 0.0f)); + ir.Write(IROp::SetConstF, dregs[3], ir.AddConstantFloat((vd & 3) == 3 ? 1.0f : 0.0f)); + break; + default: + DISABLE; + } } - int row = vd & 3; - // Might not be consecutive if masked. - Vec4Init init = Vec4Init((int)Vec4Init::Set_1000 + row); - ir.Write(IROp::Vec4Init, dregs[0], (int)init); + ApplyPrefixD(dregs, sz); } diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index d604f6531475..de97635ae8ae 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -529,7 +529,7 @@ bool PurgeTemps(const IRWriter &in, IRWriter &out) { break; } - // TODO: VFPU temps? + // TODO: VFPU temps? Especially for masked dregs. insts.push_back(inst); } From 7b43e0e59d561c8b410e9224ceffc4b784ce0904 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 12 May 2016 22:53:21 -0700 Subject: [PATCH 66/77] jit-ir: Add nan/inf compares. Without this, Gods Eater Burst crashes before going in game. 
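(Sketch for context, not part of the original message: the new VC_EN/NN/EI/NI/ES/NS
cases rely on PPSSPP's float-classification helpers. In portable C++ they amount to
roughly

    #include <cmath>  // HUGE_VALF
    static inline bool my_isnan(float f)      { return f != f; }
    static inline bool my_isinf(float f)      { return f == HUGE_VALF || f == -HUGE_VALF; }
    static inline bool my_isnanorinf(float f) { return my_isnan(f) || my_isinf(f); }

though the real definitions live in the common math headers and may differ in detail.)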
--- Core/MIPS/IR/IRInterpreter.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 4141bc66da6e..2b2368ec8c8a 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -226,6 +226,12 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case VC_GE: result = mips->f[inst->src1] >= mips->f[inst->src2]; break; case VC_EZ: result = mips->f[inst->src1] == 0.0f; break; case VC_NZ: result = mips->f[inst->src1] != 0.0f; break; + case VC_EN: result = my_isnan(mips->f[inst->src1]); break; + case VC_NN: result = !my_isnan(mips->f[inst->src1]); break; + case VC_EI: result = my_isinf(mips->f[inst->src1]); break; + case VC_NI: result = !my_isinf(mips->f[inst->src1]); break; + case VC_ES: result = my_isnanorinf(mips->f[inst->src1]); break; + case VC_NS: result = !my_isnanorinf(mips->f[inst->src1]); break; case VC_TR: result = 1; break; case VC_FL: result = 0; break; default: From 066b0b7fdfc18e70896935ef199eda13f60d04a1 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 13 May 2016 07:59:39 -0700 Subject: [PATCH 67/77] jit-ir: Optimize out beql; break; sequences. These are often used following divs, and are harmless. Things get a bit easier if we just never compile them. --- Core/MIPS/IR/IRCompBranch.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index 3dd14cef5152..a6f8b93cd086 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -47,11 +47,12 @@ #define LOOPOPTIMIZATION 0 +#define MIPS_IS_BREAK(op) (((op) & 0xFC00003F) == 13) + using namespace MIPSAnalyst; namespace MIPSComp { - using namespace Arm64Gen; void IRFrontend::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { if (js.inDelaySlot) { @@ -66,6 +67,16 @@ void IRFrontend::BranchRSRTComp(MIPSOpcode op, IRComparison cc, bool likely) { MIPSOpcode delaySlotOp = GetOffsetInstruction(1); bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs); + // Often, div/divu are followed by a likely "break" if the divisor was zero. + // Stalling is not really useful for us, so we optimize this out. + if (likely && offset == 4 && MIPS_IS_BREAK(delaySlotOp)) { + // Okay, let's not actually branch at all. We're done here. + EatInstruction(delaySlotOp); + // Let's not double-count the downcount, though. + js.downcountAmount--; + return; + } + int dcAmount = js.downcountAmount; ir.Write(IROp::Downcount, 0, dcAmount & 0xFF, dcAmount >> 8); js.downcountAmount = 0; @@ -136,7 +147,7 @@ void IRFrontend::BranchRSZeroComp(MIPSOpcode op, IRComparison cc, bool andLink, } void IRFrontend::Comp_RelBranch(MIPSOpcode op) { - // The CC flags here should be opposite of the actual branch becuase they skip the branching action. + // The CC flags here should be opposite of the actual branch because they skip the branching action. 
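// The guest-side pattern the beql/break elision targets looks roughly like this
// (a branch-likely executes its delay slot only when taken, and with offset == 4
// the branch target is simply the instruction after the delay slot):
//   div   $s0, $s1
//   beql  $s1, $zero, 1f
//   break                  ; reached only when the divisor $s1 == 0
// 1:
// The MIPS_IS_BREAK test above matches any break encoding: SPECIAL opcode (top
// six bits zero) with funct 0x0D, ignoring the 20-bit code field in between.
// A standalone sketch of that check, for illustration only:

#include <cstdint>

static bool IsMipsBreak(uint32_t op) {
	return (op & 0xFC00003Fu) == 13u;  // opcode == 0, funct == 13 (break)
}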
switch (op >> 26) { case 4: BranchRSRTComp(op, IRComparison::NotEqual, false); break;//beq case 5: BranchRSRTComp(op, IRComparison::Equal, false); break;//bne From f636b2a315cff83fcfc150af61536c4f02d5d831 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 19:31:27 +0200 Subject: [PATCH 68/77] Minor build and other fixes --- Core/MIPS/IR/IRCompBranch.cpp | 1 - Core/MIPS/IR/IRCompLoadStore.cpp | 2 -- Core/MIPS/IR/IRCompVFPU.cpp | 7 ++----- Core/MIPS/IR/IRFrontend.cpp | 4 ++-- Core/MIPS/IR/IRInst.h | 1 + Core/MIPS/IR/IRInterpreter.cpp | 8 ++++---- Core/MIPS/IR/IRPassSimplify.cpp | 2 ++ 7 files changed, 11 insertions(+), 14 deletions(-) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index a6f8b93cd086..f7b875100df4 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -192,7 +192,6 @@ void IRFrontend::BranchFPFlag(MIPSOpcode op, IRComparison cc, bool likely) { int offset = _IMM16 << 2; u32 targetAddr = GetCompilerPC() + offset + 4; - MIPSOpcode delaySlotOp = GetOffsetInstruction(1); ir.Write(IROp::FpCondToReg, IRTEMP_LHS); if (!likely) CompileDelaySlot(); diff --git a/Core/MIPS/IR/IRCompLoadStore.cpp b/Core/MIPS/IR/IRCompLoadStore.cpp index b890f4ff6808..c57f1ec3235f 100644 --- a/Core/MIPS/IR/IRCompLoadStore.cpp +++ b/Core/MIPS/IR/IRCompLoadStore.cpp @@ -73,7 +73,6 @@ namespace MIPSComp { CONDITIONAL_DISABLE; int offset = (signed short)(op & 0xFFFF); - bool load = false; MIPSGPReg rt = _RT; MIPSGPReg rs = _RS; int o = op >> 26; @@ -113,7 +112,6 @@ namespace MIPSComp { case 34: //lwl case 38: //lwr - load = true; case 42: //swl case 46: //swr DISABLE; diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index ce41c19712ec..74590a213a28 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -541,13 +541,11 @@ namespace MIPSComp { if (allowSIMD && sz == V_Quad && !usingTemps && IsConsecutive4(dregs) && IsConsecutive4(sregs) && IsConsecutive4(tregs)) { IROp opFunc = IROp::Nop; - bool symmetric = false; switch (op >> 26) { case 24: //VFPU0 switch ((op >> 23) & 7) { case 0: // d[i] = s[i] + t[i]; break; //vadd opFunc = IROp::Vec4Add; - symmetric = true; break; case 1: // d[i] = s[i] - t[i]; break; //vsub opFunc = IROp::Vec4Sub; @@ -562,7 +560,6 @@ namespace MIPSComp { { case 0: // d[i] = s[i] * t[i]; break; //vmul opFunc = IROp::Vec4Mul; - symmetric = true; break; } break; @@ -1053,7 +1050,7 @@ namespace MIPSComp { } } else if (sz == M_4x4) { // Tekken 6 has a case here: MEE - // logBlocks = 1; + logBlocks = 1; } // Fallback. 
Expands a LOT @@ -1244,7 +1241,7 @@ namespace MIPSComp { DISABLE; } - logBlocks = 1; + // logBlocks = 1; VectorSize sz = GetVecSize(op); int n = GetNumVectorElements(sz); diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 5c154c7df980..d615f936f986 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -257,7 +257,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v if (logBlocks > 0 && dontLogBlocks == 0) { ILOG("=============== Original IR (%d instructions, %d const) ===============", (int)ir.GetInstructions().size(), (int)ir.GetConstants().size()); - for (int i = 0; i < ir.GetInstructions().size(); i++) { + for (size_t i = 0; i < ir.GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); ILOG("%s", buf); @@ -267,7 +267,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v if (logBlocks > 0 && dontLogBlocks == 0) { ILOG("=============== IR (%d instructions, %d const) ===============", (int)code->GetInstructions().size(), (int)code->GetConstants().size()); - for (int i = 0; i < code->GetInstructions().size(); i++) { + for (size_t i = 0; i < code->GetInstructions().size(); i++) { char buf[256]; DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); ILOG("%s", buf); diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index d2195892ba82..b33bec732eb5 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "Common/CommonTypes.h" #include "Core/MIPS/MIPS.h" diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 2b2368ec8c8a..c9d2b4ed1857 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -146,7 +146,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); #else - memcpy(&mips->f[inst->dest + i], vec4InitValues[inst->src1], 4 * sizeof(float)); + memcpy(&mips->f[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float)); #endif break; @@ -247,16 +247,16 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c case IROp::FCmpVfpuAggregate: { - int mask = inst->dest; + u32 mask = inst->dest; u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; int a = (cc & mask) ? 0x10 : 0x00; int b = (cc & mask) == mask ? 
0x20 : 0x00; - mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b;; + mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b; } break; case IROp::FCmovVfpuCC: - if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0x7f)) & 1) == (inst->src2 >> 7)) { + if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0xf)) & 1) == (inst->src2 >> 7)) { mips->f[inst->dest] = mips->f[inst->src1]; } break; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index de97635ae8ae..1fd98a3158f9 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -1,3 +1,5 @@ +#include + #include "Common/Log.h" #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/IR/IRRegCache.h" From 91a6cf5e44e74e40ec740747a00444b543b30e32 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 20:14:03 +0200 Subject: [PATCH 69/77] Add a couple more passes (2-op, optimize f<->v fp moves) --- Core/MIPS/IR/IRCompVFPU.cpp | 3 +- Core/MIPS/IR/IRFrontend.cpp | 3 +- Core/MIPS/IR/IRPassSimplify.cpp | 108 ++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.h | 3 + 4 files changed, 114 insertions(+), 3 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 74590a213a28..e0b5ecccd391 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1221,12 +1221,11 @@ namespace MIPSComp { VectorSize sz = GetVecSize(op); int n = GetNumVectorElements(sz); - VCondition cond = (VCondition)(op & 0xF); - u8 sregs[4], tregs[4]; GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixT(tregs, sz, _VT); + VCondition cond = (VCondition)(op & 0xF); int mask = 0; for (int i = 0; i < n; i++) { ir.Write(IROp::FCmpVfpuBit, cond | (i << 4), sregs[i], tregs[i]); diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index d615f936f986..c41c3cac06a8 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -232,8 +232,10 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v IRWriter *code = &ir; if (true) { static const IRPassFunc passes[] = { + &OptimizeFPMoves, &PropagateConstants, &PurgeTemps, + // &ThreeOpToTwoOp, }; if (IRApplyPasses(passes, ARRAY_SIZE(passes), ir, simplified)) logBlocks = 1; @@ -286,5 +288,4 @@ void IRFrontend::Comp_RunBlock(MIPSOpcode op) { ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); } - } // namespace \ No newline at end of file diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 1fd98a3158f9..4f9da4e49561 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -95,6 +95,114 @@ bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWri return logBlocks; } +bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { + //FMovToGPR a0, f12 + //FMovFromGPR f14, a0 + // to + //FMovToGPR a0, f12 + //FMov f14, f12 + + const u32 *constants = in.GetConstants().data(); + bool logBlocks = false; + IRInst prev; + prev.op = IROp::Nop; + for (int i = 0; i < (int)in.GetInstructions().size(); i++) { + IRInst inst = in.GetInstructions()[i]; + switch (inst.op) { + case IROp::FMovFromGPR: + if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) { + inst.op = IROp::FMov; + inst.src1 = prev.src1; + out.Write(inst); + logBlocks = true; + } else { + out.Write(inst); + } + break; + default: + // Remap constants to the new reality + const IRMeta *m = GetIRMeta(inst.op); + switch (m->types[0]) { + case 'C': + inst.dest = out.AddConstant(constants[inst.dest]); + break; + } + switch 
(m->types[1]) { + case 'C': + inst.src1 = out.AddConstant(constants[inst.src1]); + break; + } + switch (m->types[2]) { + case 'C': + inst.src2 = out.AddConstant(constants[inst.src2]); + break; + } + out.Write(inst); + break; + } + prev = inst; + } + return logBlocks; +} + +// Might be useful later on x86. +bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out) { + const u32 *constants = in.GetConstants().data(); + bool logBlocks = false; + for (int i = 0; i < (int)in.GetInstructions().size(); i++) { + IRInst inst = in.GetInstructions()[i]; + const IRMeta *meta = GetIRMeta(inst.op); + switch (inst.op) { + case IROp::Sub: + case IROp::Slt: + case IROp::SltU: + case IROp::Add: + case IROp::And: + case IROp::Or: + case IROp::Xor: + if (inst.src1 != inst.dest && inst.src2 != inst.dest) { + out.Write(IROp::Mov, inst.dest, inst.src1); + out.Write(inst.op, inst.dest, inst.dest, inst.src2); + } else { + out.Write(inst); + } + break; + case IROp::FMul: + case IROp::FAdd: + if (inst.src1 != inst.dest && inst.src2 != inst.dest) { + out.Write(IROp::FMov, inst.dest, inst.src1); + out.Write(inst.op, inst.dest, inst.dest, inst.src2); + } else { + out.Write(inst); + } + break; + default: + { + // Remap constants to the new reality + const IRMeta *m = GetIRMeta(inst.op); + switch (m->types[0]) { + case 'C': + inst.dest = out.AddConstant(constants[inst.dest]); + break; + } + switch (m->types[1]) { + case 'C': + inst.src1 = out.AddConstant(constants[inst.src1]); + break; + } + switch (m->types[2]) { + case 'C': + inst.src2 = out.AddConstant(constants[inst.src2]); + break; + } + out.Write(inst); + break; + } + } + } + return logBlocks; +} + bool PropagateConstants(const IRWriter &in, IRWriter &out) { IRRegCache gpr(&out); diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h index d6dd041259c2..80b979fbf88a 100644 --- a/Core/MIPS/IR/IRPassSimplify.h +++ b/Core/MIPS/IR/IRPassSimplify.h @@ -5,6 +5,9 @@ typedef bool (*IRPassFunc)(const IRWriter &in, IRWriter &out); bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out); +// Block optimizer passes of varying usefulness. 
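// ThreeOpToTwoOp, declared above, is aimed at two-operand architectures: x86 ALU
// ops overwrite their first source, so "d = a OP b" has to become
// "d = a; d = d OP b". That split is only safe when the destination aliases
// neither source: if d == a the instruction is already in two-operand shape, and
// if d == b the leading move would clobber an input before it is read.
// A minimal sketch of the rule using stand-in types (not the real IRInst API):

struct ThreeOpSketch {
	int dest, src1, src2;
};

static bool CanSplitToTwoOp(const ThreeOpSketch &op) {
	return op.dest != op.src1 && op.dest != op.src2;
}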
bool PropagateConstants(const IRWriter &in, IRWriter &out); bool PurgeTemps(const IRWriter &in, IRWriter &out); bool ReduceLoads(const IRWriter &in, IRWriter &out); +bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out); +bool OptimizeFPMoves(const IRWriter &in, IRWriter &out); From 5b2504120dabb47d1719b7a3902beb8a6825a9a4 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 20:15:20 +0200 Subject: [PATCH 70/77] Optimize some common prefixes --- Core/MIPS/IR/IRCompVFPU.cpp | 35 +++++++++++++++++++++++++++++---- Core/MIPS/IR/IRInst.cpp | 2 ++ Core/MIPS/IR/IRInst.h | 2 ++ Core/MIPS/IR/IRInterpreter.cpp | 10 ++++++++++ Core/MIPS/IR/IRPassSimplify.cpp | 4 +++- 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index e0b5ecccd391..6c6a52949a6a 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -108,6 +108,13 @@ namespace MIPSComp { } } + static void InitRegs(u8 *vregs, int reg) { + vregs[0] = reg; + vregs[1] = reg + 1; + vregs[2] = reg + 2; + vregs[3] = reg + 3; + } + void IRFrontend::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg) { if (prefix == 0xE4) return; @@ -119,6 +126,27 @@ namespace MIPSComp { for (int i = 0; i < n; i++) origV[i] = vregs[i]; + // Some common vector prefixes + if (sz == V_Quad && IsConsecutive4(vregs)) { + if (prefix == 0xF00E4 && IsConsecutive4(vregs)) { + InitRegs(vregs, tempReg); + ir.Write(IROp::Vec4Neg, vregs[0], origV[0]); + return; + } + if (prefix == 0x00FE4 && IsConsecutive4(vregs)) { + InitRegs(vregs, tempReg); + ir.Write(IROp::Vec4Abs, vregs[0], origV[0]); + return; + } + // Pure shuffle + if (prefix == (prefix & 0xFF)) { + InitRegs(vregs, tempReg); + ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix); + return; + } + } + + // Alright, fall back to the generic approach. for (int i = 0; i < n; i++) { int regnum = (prefix >> (i * 2)) & 3; int abs = (prefix >> (8 + i)) & 1; @@ -395,7 +423,6 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, vt); GetVectorRegsPrefixD(dregs, V_Single, vd); - // TODO: applyprefixST here somehow (shuffle, etc...) ir.Write(IROp::FMul, IRVTEMP_0, sregs[0], tregs[0]); int n = GetNumVectorElements(sz); @@ -1050,7 +1077,7 @@ namespace MIPSComp { } } else if (sz == M_4x4) { // Tekken 6 has a case here: MEE - logBlocks = 1; + // logBlocks = 1; } // Fallback. 
Expands a LOT @@ -1141,8 +1168,8 @@ namespace MIPSComp { tempregs[i] = temp; } for (int i = 0; i < n; i++) { - u8 temp = tempregs[i]; - ir.Write(IROp::FMov, dregs[i], temp); + if (tempregs[i] != dregs[i]) + ir.Write(IROp::FMov, dregs[i], tempregs[i]); } } diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 983a4e6a3a12..dac93617f235 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -109,6 +109,8 @@ static const IRMeta irMeta[] = { { IROp::Vec4Mul, "Vec4Mul", "FFF" }, { IROp::Vec4Scale, "Vec4Scale", "FFF" }, { IROp::Vec4Dot, "Vec4Dot", "FFF" }, + { IROp::Vec4Neg, "Vec4Neg", "FF" }, + { IROp::Vec4Abs, "Vec4Abs", "FF" }, { IROp::Interpret, "Interpret", "_C" }, { IROp::Downcount, "Downcount", "_II" }, diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index b33bec732eb5..86e1e31ae8b1 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -159,6 +159,8 @@ enum class IROp : u8 { Vec4Div, Vec4Scale, Vec4Dot, + Vec4Neg, + Vec4Abs, // vx2i Vec4ExpandU16ToU32Hi, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index c9d2b4ed1857..07ce9e1d79e2 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -212,6 +212,16 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c #endif break; + case IROp::Vec4Neg: + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = -mips->f[inst->src1 + i]; + break; + + case IROp::Vec4Abs: + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = fabsf(mips->f[inst->src1 + i]); + break; + case IROp::FCmpVfpuBit: { int op = inst->dest & 0xF; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 4f9da4e49561..9572db730978 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -114,11 +114,11 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { inst.op = IROp::FMov; inst.src1 = prev.src1; out.Write(inst); - logBlocks = true; } else { out.Write(inst); } break; + default: // Remap constants to the new reality const IRMeta *m = GetIRMeta(inst.op); @@ -487,6 +487,8 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { case IROp::Vec4Dot: case IROp::Vec4Scale: case IROp::Vec4Shuffle: + case IROp::Vec4Neg: + case IROp::Vec4Abs: out.Write(inst); break; From 5923013d659f5b45eeef7660607c853673559c96 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 20:21:19 +0200 Subject: [PATCH 71/77] Simple workaround for timing issue with coreState after syscall. Also fixes off by one in ForceCheck. --- Core/CoreTiming.cpp | 4 ++-- Core/MIPS/IR/IRInterpreter.cpp | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Core/CoreTiming.cpp b/Core/CoreTiming.cpp index 8e816fcc454d..feece5e5b33d 100644 --- a/Core/CoreTiming.cpp +++ b/Core/CoreTiming.cpp @@ -567,10 +567,10 @@ void MoveEvents() void ForceCheck() { - int cyclesExecuted = slicelength - currentMIPS->downcount; + int cyclesExecuted = slicelength - currentMIPS->downcount + 1; globalTimer += cyclesExecuted; // This will cause us to check for new events immediately. - currentMIPS->downcount = 0; + currentMIPS->downcount = -1; // But let's not eat a bunch more time in Advance() because of this. 
slicelength = 0; } diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 07ce9e1d79e2..c0bcdb91ca3c 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -10,6 +10,8 @@ #include "Core/HLE/ReplaceTables.h" #include "Core/MIPS/MIPSTables.h" #include "Core/MIPS/MIPSVFPUUtils.h" +#include "Core/System.h" +#include "Core/CoreTiming.h" #include "math/math_util.h" #include "Common/CommonTypes.h" @@ -583,6 +585,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, const u32 *constPool, int c { MIPSOpcode op(constPool[inst->src1]); CallSyscall(op); + if (coreState != CORE_RUNNING) + CoreTiming::ForceCheck(); return mips->pc; } From b7091a8f5da11bb8c02ebdbbf886f9b214134a10 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 21:02:23 +0200 Subject: [PATCH 72/77] Simplifications and fixes --- Core/CoreTiming.cpp | 4 +-- Core/MIPS/IR/IRCompVFPU.cpp | 23 +++++++++++++---- Core/MIPS/IR/IRPassSimplify.cpp | 44 ++++++--------------------------- 3 files changed, 28 insertions(+), 43 deletions(-) diff --git a/Core/CoreTiming.cpp b/Core/CoreTiming.cpp index feece5e5b33d..61956cf1e3a1 100644 --- a/Core/CoreTiming.cpp +++ b/Core/CoreTiming.cpp @@ -567,12 +567,12 @@ void MoveEvents() void ForceCheck() { - int cyclesExecuted = slicelength - currentMIPS->downcount + 1; + int cyclesExecuted = slicelength - currentMIPS->downcount; globalTimer += cyclesExecuted; // This will cause us to check for new events immediately. currentMIPS->downcount = -1; // But let's not eat a bunch more time in Advance() because of this. - slicelength = 0; + slicelength = 1; } void Advance() diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 6c6a52949a6a..2160175cd286 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -128,12 +128,12 @@ namespace MIPSComp { // Some common vector prefixes if (sz == V_Quad && IsConsecutive4(vregs)) { - if (prefix == 0xF00E4 && IsConsecutive4(vregs)) { + if (prefix == 0xF00E4) { InitRegs(vregs, tempReg); ir.Write(IROp::Vec4Neg, vregs[0], origV[0]); return; } - if (prefix == 0x00FE4 && IsConsecutive4(vregs)) { + if (prefix == 0x00FE4) { InitRegs(vregs, tempReg); ir.Write(IROp::Vec4Abs, vregs[0], origV[0]); return; @@ -1123,7 +1123,7 @@ namespace MIPSComp { GetVectorRegs(dregs, sz, _VD); // SIMD-optimized implementations - if sregs[0..3] is consecutive, the rest are too. 
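// The prefix fast paths above decode the VFPU source-prefix word directly. Per
// the generic per-lane loop in ApplyPrefixST, the low byte holds two swizzle
// selector bits per lane and bits 8..11 are per-lane abs flags; the negate flags
// are taken here to sit in the high nibble that 0xF00E4 sets. The special-cased
// values then read as:
//   0x000E4 - swizzle 3,2,1,0 with no flags: the identity prefix, nothing to do
//   0x00FE4 - identity swizzle + abs on all four lanes    -> one Vec4Abs
//   0xF00E4 - identity swizzle + negate on all four lanes -> one Vec4Neg
//   only bits 0..7 set                                    -> pure Vec4Shuffle
// Standalone helpers expressing the two field accesses visible in the generic
// loop (sketch only):

#include <cstdint>

static int PrefixSwizzle(uint32_t prefix, int lane) {
	return (prefix >> (lane * 2)) & 3;
}

static bool PrefixAbs(uint32_t prefix, int lane) {
	return ((prefix >> (8 + lane)) & 1) != 0;
}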
- if (msz == M_4x4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { + if (msz == M_4x4 && IsConsecutive4(sregs)) { int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; if (!IsConsecutive4(tregs)) { @@ -1136,13 +1136,26 @@ namespace MIPSComp { ir.Write(IROp::Vec4Add, s0, s0, sregs[i * 4]); } } - ir.Write(IROp::Vec4Mov, dregs[0], s0); + + if (IsConsecutive4(dregs)) { + ir.Write(IROp::Vec4Mov, dregs[0], s0); + } else { + for (int i = 0; i < 4; i++) { + ir.Write(IROp::FMov, dregs[i], s0 + i); + } + } return; } else if (!homogenous) { for (int i = 0; i < 4; i++) { ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[0]); } - ir.Write(IROp::Vec4Mov, dregs[0], s0); + if (IsConsecutive4(dregs)) { + ir.Write(IROp::Vec4Mov, dregs[0], s0); + } else { + for (int i = 0; i < 4; i++) { + ir.Write(IROp::FMov, dregs[i], s0 + i); + } + } return; } } else if (msz == M_4x4) { diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 9572db730978..637e6b22fae4 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -120,28 +120,15 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { break; default: - // Remap constants to the new reality - const IRMeta *m = GetIRMeta(inst.op); - switch (m->types[0]) { - case 'C': - inst.dest = out.AddConstant(constants[inst.dest]); - break; - } - switch (m->types[1]) { - case 'C': - inst.src1 = out.AddConstant(constants[inst.src1]); - break; - } - switch (m->types[2]) { - case 'C': - inst.src2 = out.AddConstant(constants[inst.src2]); - break; - } out.Write(inst); break; } prev = inst; } + // Can reuse the old constants array - not touching constants in this pass. + for (u32 value : in.GetConstants()) { + out.AddConstant(value); + } return logBlocks; } @@ -177,28 +164,13 @@ bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out) { } break; default: - { - // Remap constants to the new reality - const IRMeta *m = GetIRMeta(inst.op); - switch (m->types[0]) { - case 'C': - inst.dest = out.AddConstant(constants[inst.dest]); - break; - } - switch (m->types[1]) { - case 'C': - inst.src1 = out.AddConstant(constants[inst.src1]); - break; - } - switch (m->types[2]) { - case 'C': - inst.src2 = out.AddConstant(constants[inst.src2]); - break; - } out.Write(inst); break; } - } + } + // Can reuse the old constants array - not touching constants in this pass. + for (u32 value : in.GetConstants()) { + out.AddConstant(value); } return logBlocks; } From b7224e269c822725ff2dc69220f18e6f31a2acf5 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 13 May 2016 19:50:25 -0700 Subject: [PATCH 73/77] Qt: Remove CPU core option from menu. We've removed on Windows too, and this fixes a build error. 
--- Qt/Core.pro | 2 ++ Qt/mainwindow.cpp | 2 -- Qt/mainwindow.h | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Qt/Core.pro b/Qt/Core.pro index 13827a029d01..8a23d36e0e41 100644 --- a/Qt/Core.pro +++ b/Qt/Core.pro @@ -38,6 +38,7 @@ SOURCES += $$P/Core/*.cpp \ # Core $$P/Core/HLE/*.cpp \ $$P/Core/HW/*.cpp \ $$P/Core/MIPS/*.cpp \ + $$P/Core/MIPS/IR/*.cpp \ $$P/Core/MIPS/JitCommon/*.cpp \ $$P/Core/Util/AudioFormat.cpp \ $$P/Core/Util/BlockAllocator.cpp \ @@ -56,6 +57,7 @@ HEADERS += $$P/Core/*.h \ $$P/Core/HLE/*.h \ $$P/Core/HW/*.h \ $$P/Core/MIPS/*.h \ + $$P/Core/MIPS/IR/*.h \ $$P/Core/MIPS/JitCommon/*.h \ $$P/Core/Util/AudioFormat.h \ $$P/Core/Util/BlockAllocator.h \ diff --git a/Qt/mainwindow.cpp b/Qt/mainwindow.cpp index 12862659c021..9ceca618ab45 100644 --- a/Qt/mainwindow.cpp +++ b/Qt/mainwindow.cpp @@ -515,8 +515,6 @@ void MainWindow::createMenus() MenuTree* optionsMenu = new MenuTree(this, menuBar(), QT_TR_NOOP("&Options")); // - Core MenuTree* coreMenu = new MenuTree(this, optionsMenu, QT_TR_NOOP("&Core")); - coreMenu->add(new MenuAction(this, SLOT(dynarecAct()), QT_TR_NOOP("&CPU Dynarec"))) - ->addEventChecked(&g_Config.bJit); coreMenu->add(new MenuAction(this, SLOT(vertexDynarecAct()), QT_TR_NOOP("&Vertex Decoder Dynarec"))) ->addEventChecked(&g_Config.bVertexDecoderJit); coreMenu->add(new MenuAction(this, SLOT(fastmemAct()), QT_TR_NOOP("Fast &Memory (unstable)"))) diff --git a/Qt/mainwindow.h b/Qt/mainwindow.h index ae201054aa1f..1428f4dea8bc 100644 --- a/Qt/mainwindow.h +++ b/Qt/mainwindow.h @@ -87,7 +87,6 @@ private slots: // Options // Core - void dynarecAct() { g_Config.iCpuCore = g_Config.iCpuCore == CPU_CORE_INTERPRETER ? CPU_CORE_JIT : CPU_CORE_INTERPRETER; } void vertexDynarecAct() { g_Config.bVertexDecoderJit = !g_Config.bVertexDecoderJit; } void fastmemAct() { g_Config.bFastMemory = !g_Config.bFastMemory; } void ignoreIllegalAct() { g_Config.bIgnoreBadMemAccess = !g_Config.bIgnoreBadMemAccess; } From efc8a8e3531f27dd2063e6ab7d726da60d487551 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 13 May 2016 20:17:20 -0700 Subject: [PATCH 74/77] Hack to make Symbian build. 
--- Core/MIPS/IR/IRCompBranch.cpp | 2 -- Core/MIPS/IR/IRFrontend.cpp | 4 ++-- Core/MIPS/IR/IRInst.h | 10 ++++++++++ Core/MIPS/IR/IRJit.h | 26 ++++++++++++++++++++++++++ Core/MIPS/IR/IRPassSimplify.cpp | 4 +--- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/Core/MIPS/IR/IRCompBranch.cpp b/Core/MIPS/IR/IRCompBranch.cpp index f7b875100df4..609b8f478fd1 100644 --- a/Core/MIPS/IR/IRCompBranch.cpp +++ b/Core/MIPS/IR/IRCompBranch.cpp @@ -31,8 +31,6 @@ #include "Core/MIPS/IR/IRFrontend.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" -#include "Common/Arm64Emitter.h" - #define _RS MIPS_GET_RS(op) #define _RT MIPS_GET_RT(op) #define _RD MIPS_GET_RD(op) diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index c41c3cac06a8..6353816407a9 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -261,7 +261,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v ILOG("=============== Original IR (%d instructions, %d const) ===============", (int)ir.GetInstructions().size(), (int)ir.GetConstants().size()); for (size_t i = 0; i < ir.GetInstructions().size(); i++) { char buf[256]; - DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], ir.GetConstants().data()); + DisassembleIR(buf, sizeof(buf), ir.GetInstructions()[i], &ir.GetConstants()[0]); ILOG("%s", buf); } ILOG("=============== end ================="); @@ -271,7 +271,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector &instructions, std::v ILOG("=============== IR (%d instructions, %d const) ===============", (int)code->GetInstructions().size(), (int)code->GetConstants().size()); for (size_t i = 0; i < code->GetInstructions().size(); i++) { char buf[256]; - DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], code->GetConstants().data()); + DisassembleIR(buf, sizeof(buf), code->GetInstructions()[i], &code->GetConstants()[0]); ILOG("%s", buf); } ILOG("=============== end ================="); diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 86e1e31ae8b1..b1e17a6c45c7 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -6,6 +6,16 @@ #include "Common/CommonTypes.h" #include "Core/MIPS/MIPS.h" +#ifdef __SYMBIAN32__ +// Seems std::move() doesn't exist, so assuming it can't do moves at all. +namespace std { + template + const T &move(const T &x) { + return x; + } +}; +#endif + // Basic IR // // This IR refers implicitly to the MIPS register set and is simple to interpret. diff --git a/Core/MIPS/IR/IRJit.h b/Core/MIPS/IR/IRJit.h index 87a8231bff18..38f0df6a3e3e 100644 --- a/Core/MIPS/IR/IRJit.h +++ b/Core/MIPS/IR/IRJit.h @@ -19,6 +19,7 @@ #include +#include "Common/Common.h" #include "Common/CPUDetect.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "Core/MIPS/JitCommon/JitCommon.h" @@ -49,6 +50,31 @@ class IRBlock { b.const_ = nullptr; } + IRBlock(const IRBlock &b) { + *this = b; + } + + IRBlock &operator=(const IRBlock &b) { + // No std::move on Symbian... But let's try not to use elsewhere. 
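// Consequence of the shim: std::move now yields a const reference, so anything
// that would normally move an IRBlock (e.g. a growing container of blocks)
// silently falls back to copying on Symbian. That is why IRBlock's copy
// constructor and copy assignment deep-copy the instr_ and const_ arrays rather
// than sharing the pointers, which would double-delete in ~IRBlock. The
// _assert_(false) guarded by #ifndef __SYMBIAN32__ marks the copy path as
// intentionally dead on platforms where real moves are available.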
+#ifndef __SYMBIAN32__ + _assert_(false); +#endif + numInstructions_ = b.numInstructions_; + numConstants_ = b.numConstants_; + instr_ = new IRInst[numInstructions_]; + if (numInstructions_) { + memcpy(instr_, b.instr_, sizeof(IRInst) * numInstructions_); + } + const_ = new u32[numConstants_]; + if (numConstants_) { + memcpy(const_, b.const_, sizeof(u32) * numConstants_); + } + origAddr_ = b.origAddr_; + origFirstOpcode_ = b.origFirstOpcode_; + + return *this; + } + ~IRBlock() { delete[] instr_; delete[] const_; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 637e6b22fae4..eb0b892b06f4 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -102,7 +102,6 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { //FMovToGPR a0, f12 //FMov f14, f12 - const u32 *constants = in.GetConstants().data(); bool logBlocks = false; IRInst prev; prev.op = IROp::Nop; @@ -134,7 +133,6 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { // Might be useful later on x86. bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out) { - const u32 *constants = in.GetConstants().data(); bool logBlocks = false; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; @@ -178,7 +176,7 @@ bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out) { bool PropagateConstants(const IRWriter &in, IRWriter &out) { IRRegCache gpr(&out); - const u32 *constants = in.GetConstants().data(); + const u32 *constants = &in.GetConstants()[0]; bool logBlocks = false; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; From e2aca38f8b3990adb57951f17f5873e8015f57d2 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 13 May 2016 21:48:23 -0700 Subject: [PATCH 75/77] Try enabling Travis caching. 
--- .travis.sh | 5 ++++- .travis.yml | 43 +++++++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/.travis.sh b/.travis.sh index 4123e457294a..60adacf9c2e1 100644 --- a/.travis.sh +++ b/.travis.sh @@ -1,5 +1,6 @@ #/bin/bash +export NDK_CCACHE=ccache NDK_VER=android-ndk-r10d download_extract() { @@ -85,7 +86,7 @@ travis_script() { # Compile PPSSPP if [ "$PPSSPP_BUILD_TYPE" = "Linux" ]; then if [ "$CXX" = "g++" ]; then - export CXX="g++-4.8" CC="gcc-4.8" + export CXX="ccache g++-4.8" CC="ccache gcc-4.8" fi if [ "$QT" = "TRUE" ]; then @@ -123,6 +124,8 @@ travis_script() { } travis_after_success() { + ccache -s + if [ "$PPSSPP_BUILD_TYPE" = "Linux" ]; then ./test.py fi diff --git a/.travis.yml b/.travis.yml index c4192ea5e36f..c6dd38b16d6a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,20 +4,19 @@ sudo: required language: cpp -os: linux - dist: trusty -compiler: - - gcc +addons: + apt: + packages: + - build-essential + - libgl1-mesa-dev + - libglu1-mesa-dev + - cmake -env: - - PPSSPP_BUILD_TYPE=Linux - CMAKE=TRUE - - PPSSPP_BUILD_TYPE=Android - - PPSSPP_BUILD_TYPE=Blackberry - CMAKE=TRUE - - PPSSPP_BUILD_TYPE=Symbian +cache: + - apt + - ccache notifications: irc: @@ -30,15 +29,31 @@ notifications: matrix: include: - - compiler: clang + - os: linux + compiler: "gcc linux" + env: PPSSPP_BUILD_TYPE=Linux + CMAKE=TRUE + - os: linux + compiler: "gcc android" + env: PPSSPP_BUILD_TYPE=Android + - os: linux + compiler: "gcc blackberry" + env: PPSSPP_BUILD_TYPE=Blackberry + CMAKE=TRUE + - os: linux + compiler: "gcc symbian" + env: PPSSPP_BUILD_TYPE=Symbian + - os: linux + compiler: "clang linux" env: PPSSPP_BUILD_TYPE=Linux CMAKE=TRUE - - compiler: gcc + - os: linux + compiler: "gcc qt" env: PPSSPP_BUILD_TYPE=Linux QT=TRUE # Can't get iOS to work. 
# - os: osx -# compiler: clang +# compiler: "clang ios" # env: PPSSPP_BUILD_TYPE=iOS # CMAKE=TRUE From 7a7c3b9b9fa42a7254e33a3b9738cee831ab4048 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Fri, 13 May 2016 22:58:10 +0200 Subject: [PATCH 76/77] More VFPU, vmmul thoughts --- Core/MIPS/IR/IRBackendX86.cpp | 622 ++++++++++++++++++++++++++++++++ Core/MIPS/IR/IRCompVFPU.cpp | 138 ++++--- Core/MIPS/IR/IRPassSimplify.cpp | 78 ++-- 3 files changed, 760 insertions(+), 78 deletions(-) create mode 100644 Core/MIPS/IR/IRBackendX86.cpp diff --git a/Core/MIPS/IR/IRBackendX86.cpp b/Core/MIPS/IR/IRBackendX86.cpp new file mode 100644 index 000000000000..388962d938b4 --- /dev/null +++ b/Core/MIPS/IR/IRBackendX86.cpp @@ -0,0 +1,622 @@ + + +#include "Common/x64Emitter.h" +#include "Core/MIPS/IR/IRInst.h" +#include "Core/MemMap.h" + +// Still need a register cache +struct Mapping { + Gen::OpArg dst; + Gen::OpArg src1; + Gen::OpArg src2; +}; + +class RegisterMap { +public: + Mapping Map(IRInst inst); +private: +}; + +Mapping RegisterMap::Map(IRInst inst) { + Mapping map; + return map; +} + + +class IRBackendX86 : public Gen::XCodeBlock { +public: + void Compile(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count); +}; + +void IRBackendX86::Compile(MIPSState *mips, const IRInst *inst, const u32 *constPool, int count) { + RegisterMap regMap; + using namespace Gen; + + const IRInst *end = inst + count; + while (inst != end) { + Mapping map = regMap.Map(*inst); + switch (inst->op) { + /* + case IROp::Nop: + break; + case IROp::SetConst: + MOV(32, map.dst, map.src1); + break; + case IROp::SetConstF: + MOV(32, R(EAX), map.src1); + MOVD_xmm(map.dst, EAX); + break; + case IROp::Add: + if (map.src1.IsSimpleReg() && map.src2.IsSimpleReg()) { + LEA(32, map.dst.GetSimpleReg(), MRegSum(map.src1.GetSimpleReg(), map.src2.GetSimpleReg())); + break; + } + mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2]; + break; + case IROp::Sub: + mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2]; + break; + case IROp::And: + mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2]; + break; + case IROp::Or: + mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2]; + break; + case IROp::Xor: + mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2]; + break; + case IROp::Mov: + mips->r[inst->dest] = mips->r[inst->src1]; + break; + case IROp::AddConst: + mips->r[inst->dest] = mips->r[inst->src1] + constPool[inst->src2]; + break; + case IROp::SubConst: + mips->r[inst->dest] = mips->r[inst->src1] - constPool[inst->src2]; + break; + case IROp::AndConst: + mips->r[inst->dest] = mips->r[inst->src1] & constPool[inst->src2]; + break; + case IROp::OrConst: + mips->r[inst->dest] = mips->r[inst->src1] | constPool[inst->src2]; + break; + case IROp::XorConst: + mips->r[inst->dest] = mips->r[inst->src1] ^ constPool[inst->src2]; + break; + case IROp::Neg: + mips->r[inst->dest] = -(s32)mips->r[inst->src1]; + break; + case IROp::Not: + mips->r[inst->dest] = ~mips->r[inst->src1]; + break; + case IROp::Ext8to32: + mips->r[inst->dest] = (s32)(s8)mips->r[inst->src1]; + break; + case IROp::Ext16to32: + mips->r[inst->dest] = (s32)(s16)mips->r[inst->src1]; + break; + + case IROp::Load8: + mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load8Ext: + mips->r[inst->dest] = (s32)(s8)Memory::ReadUnchecked_U8(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16: + mips->r[inst->dest] = 
Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load16Ext: + mips->r[inst->dest] = (s32)(s16)Memory::ReadUnchecked_U16(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Load32: + mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::LoadFloat: + mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::Store8: + Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store16: + Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::Store32: + Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + case IROp::StoreFloat: + Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + constPool[inst->src2]); + break; + + case IROp::LoadVec4: + { + u32 base = mips->r[inst->src1] + constPool[inst->src2]; +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps((const float *)Memory::GetPointerUnchecked(base))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = Memory::ReadUnchecked_Float(base + 4 * i); +#endif + break; + } + case IROp::StoreVec4: + { + u32 base = mips->r[inst->src1] + constPool[inst->src2]; +#if defined(_M_SSE) + _mm_store_ps((float *)Memory::GetPointerUnchecked(base), _mm_load_ps(&mips->f[inst->dest])); +#else + for (int i = 0; i < 4; i++) + Memory::WriteUnchecked_Float(mips->f[inst->dest + i], base + 4 * i); +#endif + break; + } + + case IROp::Vec4Init: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(vec4InitValues[inst->src1])); +#else + memcpy(&mips->f[inst->dest + i], vec4InitValues[inst->src1], 4 * sizeof(float)); +#endif + break; + + case IROp::Vec4Shuffle: + { + // Can't use the SSE shuffle here because it takes an immediate. + // Backends with SSE support could use that though. 
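// Sketch of what a native backend could do instead: unlike this interpreter
// fallback, the JIT sees inst->src2 as a compile-time constant, so the whole
// shuffle can be baked into one SSE instruction's immediate, along the lines of
//   PSHUFD(dstXmm, R(srcXmm), (u8)inst->src2);
// assuming the emitter exposes the usual PSHUFD/SHUFPS helpers and the register
// cache has mapped dest and src1 to XMM registers.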
+ for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)]; + break; + } + + case IROp::Vec4Mov: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1])); +#else + memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float)); +#endif + break; + + case IROp::Vec4Add: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Sub: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Mul: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Div: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i]; +#endif + break; + + case IROp::Vec4Scale: +#if defined(_M_SSE) + _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2]))); +#else + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; +#endif + break; + + case IROp::FCmpVfpuBit: + { + int op = inst->dest & 0xF; + int bit = inst->dest >> 4; + int result = 0; + switch (op) { + case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break; + case VC_NE: result = mips->f[inst->src1] != mips->f[inst->src2]; break; + case VC_LT: result = mips->f[inst->src1] < mips->f[inst->src2]; break; + case VC_LE: result = mips->f[inst->src1] <= mips->f[inst->src2]; break; + case VC_GT: result = mips->f[inst->src1] > mips->f[inst->src2]; break; + case VC_GE: result = mips->f[inst->src1] >= mips->f[inst->src2]; break; + case VC_EZ: result = mips->f[inst->src1] == 0.0f; break; + case VC_NZ: result = mips->f[inst->src1] != 0.0f; break; + case VC_TR: result = 1; break; + case VC_FL: result = 0; break; + default: + result = 0; + } + if (result != 0) { + mips->vfpuCtrl[VFPU_CTRL_CC] |= (1 << bit); + } else { + mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit); + } + } + break; + + case IROp::FCmpVfpuAggregate: + { + int mask = inst->dest; + u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; + int a = (cc & mask) ? 0x10 : 0x00; + int b = (cc & mask) == mask ? 0x20 : 0x00; + mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | a | b;; + } + break; + + case IROp::FCmovVfpuCC: + if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0x7f)) & 1) == (inst->src2 >> 7)) { + mips->f[inst->dest] = mips->f[inst->src1]; + } + break; + + // Not quickly implementable on all platforms, unfortunately. 
+ case IROp::Vec4Dot: + { + float dot = mips->f[inst->src1] * mips->f[inst->src2]; + for (int i = 1; i < 4; i++) + dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i]; + mips->f[inst->dest] = dot; + break; + } + + case IROp::FSin: + mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]); + break; + case IROp::FCos: + mips->f[inst->dest] = vfpu_cos(mips->f[inst->src1]); + break; + case IROp::FRSqrt: + mips->f[inst->dest] = 1.0f / sqrtf(mips->f[inst->src1]); + break; + case IROp::FRecip: + mips->f[inst->dest] = 1.0f / mips->f[inst->src1]; + break; + case IROp::FAsin: + mips->f[inst->dest] = vfpu_asin(mips->f[inst->src1]); + break; + + case IROp::ShlImm: + mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2; + break; + case IROp::ShrImm: + mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2; + break; + case IROp::SarImm: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2; + break; + case IROp::RorImm: + { + u32 x = mips->r[inst->src1]; + int sa = inst->src2; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::Shl: + mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31); + break; + case IROp::Shr: + mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Sar: + mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31); + break; + case IROp::Ror: + { + u32 x = mips->r[inst->src1]; + int sa = mips->r[inst->src2] & 31; + mips->r[inst->dest] = (x >> sa) | (x << (32 - sa)); + } + break; + + case IROp::Clz: + { + int x = 31; + int count = 0; + int value = mips->r[inst->src1]; + while (x >= 0 && !(value & (1 << x))) { + count++; + x--; + } + mips->r[inst->dest] = count; + break; + } + + case IROp::Slt: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2]; + break; + + case IROp::SltU: + mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2]; + break; + + case IROp::SltConst: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)constPool[inst->src2]; + break; + + case IROp::SltUConst: + mips->r[inst->dest] = mips->r[inst->src1] < constPool[inst->src2]; + break; + + case IROp::MovZ: + if (mips->r[inst->src1] == 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + case IROp::MovNZ: + if (mips->r[inst->src1] != 0) + mips->r[inst->dest] = mips->r[inst->src2]; + break; + + case IROp::Max: + mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2]; + break; + case IROp::Min: + mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? 
mips->r[inst->src1] : mips->r[inst->src2]; + break; + + case IROp::MtLo: + mips->lo = mips->r[inst->src1]; + break; + case IROp::MtHi: + mips->hi = mips->r[inst->src1]; + break; + case IROp::MfLo: + mips->r[inst->dest] = mips->lo; + break; + case IROp::MfHi: + mips->r[inst->dest] = mips->hi; + break; + + case IROp::Mult: + { + s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + case IROp::MultU: + { + u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; + memcpy(&mips->lo, &result, 8); + break; + } + + case IROp::BSwap16: + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); + break; + } + case IROp::BSwap32: + { + u32 x = mips->r[inst->src1]; + mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24); + break; + } + + case IROp::FAdd: + mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2]; + break; + case IROp::FSub: + mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2]; + break; + case IROp::FMul: + mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2]; + break; + case IROp::FDiv: + mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2]; + break; + case IROp::FMin: + mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]); + break; + case IROp::FMax: + mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]); + break; + + case IROp::FMov: + mips->f[inst->dest] = mips->f[inst->src1]; + break; + case IROp::FAbs: + mips->f[inst->dest] = fabsf(mips->f[inst->src1]); + break; + case IROp::FSqrt: + mips->f[inst->dest] = sqrtf(mips->f[inst->src1]); + break; + case IROp::FNeg: + mips->f[inst->dest] = -mips->f[inst->src1]; + break; + case IROp::FSat0_1: + mips->f[inst->dest] = clamp_value(mips->f[inst->src1], 0.0f, 1.0f); + break; + case IROp::FSatMinus1_1: + mips->f[inst->dest] = clamp_value(mips->f[inst->src1], -1.0f, 1.0f); + break; + + case IROp::FpCondToReg: + mips->r[inst->dest] = mips->fpcond; + break; + case IROp::VfpuCtrlToReg: + mips->r[inst->dest] = mips->vfpuCtrl[inst->src1]; + break; + case IROp::FRound: + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1] + 0.5f); + break; + case IROp::FTrunc: + { + float src = mips->f[inst->src1]; + if (src >= 0.0f) { + mips->fs[inst->dest] = (int)floorf(src); + // Overflow, but it was positive. + if (mips->fs[inst->dest] == -2147483648LL) { + mips->fs[inst->dest] = 2147483647LL; + } + } else { + // Overflow happens to be the right value anyway. 
+ mips->fs[inst->dest] = (int)ceilf(src); + } + break; + } + case IROp::FCeil: + mips->fs[inst->dest] = (int)ceilf(mips->f[inst->src1]); + break; + case IROp::FFloor: + mips->fs[inst->dest] = (int)floorf(mips->f[inst->src1]); + break; + case IROp::FCmp: + switch (inst->dest) { + case IRFpCompareMode::False: + mips->fpcond = 0; + break; + case IRFpCompareMode::EqualOrdered: + case IRFpCompareMode::EqualUnordered: + mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2]; + break; + case IRFpCompareMode::LessEqualOrdered: + case IRFpCompareMode::LessEqualUnordered: + mips->fpcond = mips->f[inst->src1] <= mips->f[inst->src2]; + break; + case IRFpCompareMode::LessOrdered: + case IRFpCompareMode::LessUnordered: + mips->fpcond = mips->f[inst->src1] < mips->f[inst->src2]; + break; + } + break; + + case IROp::FCvtSW: + mips->f[inst->dest] = (float)mips->fs[inst->src1]; + break; + case IROp::FCvtWS: + { + float src = mips->f[inst->src1]; + if (my_isnanorinf(src)) + { + mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? -2147483648LL : 2147483647LL; + break; + } + switch (mips->fcr31 & 3) + { + case 0: mips->fs[inst->dest] = (int)round_ieee_754(src); break; // RINT_0 + case 1: mips->fs[inst->dest] = (int)src; break; // CAST_1 + case 2: mips->fs[inst->dest] = (int)ceilf(src); break; // CEIL_2 + case 3: mips->fs[inst->dest] = (int)floorf(src); break; // FLOOR_3 + } + break; //cvt.w.s + } + + case IROp::ZeroFpCond: + mips->fpcond = 0; + break; + + case IROp::FMovFromGPR: + memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4); + break; + case IROp::FMovToGPR: + memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4); + break; + + case IROp::ExitToConst: + return constPool[inst->dest]; + + case IROp::ExitToReg: + return mips->r[inst->dest]; + + case IROp::ExitToConstIfEq: + if (mips->r[inst->src1] == mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfNeq: + if (mips->r[inst->src1] != mips->r[inst->src2]) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGtZ: + if ((s32)mips->r[inst->src1] > 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfGeZ: + if ((s32)mips->r[inst->src1] >= 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLtZ: + if ((s32)mips->r[inst->src1] < 0) + return constPool[inst->dest]; + break; + case IROp::ExitToConstIfLeZ: + if ((s32)mips->r[inst->src1] <= 0) + return constPool[inst->dest]; + break; + + case IROp::Downcount: + mips->downcount -= (inst->src1) | ((inst->src2) << 8); + break; + + case IROp::SetPC: + mips->pc = mips->r[inst->src1]; + break; + + case IROp::SetPCConst: + mips->pc = constPool[inst->src1]; + break; + + case IROp::Syscall: + // SetPC was executed before. + { + MIPSOpcode op(constPool[inst->src1]); + CallSyscall(op); + return mips->pc; + } + + case IROp::Interpret: // SLOW fallback. Can be made faster. 
+ { + MIPSOpcode op(constPool[inst->src1]); + MIPSInterpret(op); + break; + } + + case IROp::CallReplacement: + { + int funcIndex = constPool[inst->src1]; + const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); + int cycles = f->replaceFunc(); + mips->downcount -= cycles; + break; + } + + case IROp::Break: + Crash(); + break; + + case IROp::SetCtrlVFPU: + mips->vfpuCtrl[inst->dest] = constPool[inst->src1]; + break; + + case IROp::SetCtrlVFPUReg: + mips->vfpuCtrl[inst->dest] = mips->r[inst->src1]; + break; + + case IROp::SetCtrlVFPUFReg: + memcpy(&mips->vfpuCtrl[inst->dest], &mips->f[inst->src1], 4); + break; + */ + default: + Crash(); + } + inst++; + } + + // If we got here, the block was badly constructed. + Crash(); +} diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 2160175cd286..1b484fbb602d 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -483,7 +483,9 @@ namespace MIPSComp { int vd = _VD; int vs = _VS; int vt = _VT; + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); // TODO: Force read one of them into regs? probably not. u8 sregs[4], tregs[4], dregs[1]; @@ -491,10 +493,15 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, vt); GetVectorRegsPrefixD(dregs, V_Single, vd); + if (sz == V_Quad && IsConsecutive4(sregs) && IsConsecutive4(tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) { + ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]); + ApplyPrefixD(dregs, V_Single); + return; + } + int temp0 = IRVTEMP_0; int temp1 = IRVTEMP_0 + 1; ir.Write(IROp::FMul, temp0, sregs[0], tregs[0]); - int n = GetNumVectorElements(sz); for (int i = 1; i < n; i++) { ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]); ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1); @@ -681,7 +688,7 @@ namespace MIPSComp { GetVectorRegsPrefixD(dregs, sz, vd); bool usingTemps = false; - int tempregs[4]; + u8 tempregs[4]; for (int i = 0; i < n; ++i) { if (!IsOverlapSafe(dregs[i], n, sregs)) { usingTemps = true; @@ -790,7 +797,7 @@ namespace MIPSComp { GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixD(dregs, sz, _VD); - int tempregs[4]; + u8 tempregs[4]; for (int i = 0; i < n; ++i) { if (!IsOverlapSafe(dregs[i], n, sregs)) { tempregs[i] = IRVTEMP_PFX_T + i; // Need IRVTEMP_0 for the scaling factor @@ -976,31 +983,35 @@ namespace MIPSComp { VectorSize sz = GetVecSize(op); int n = GetNumVectorElements(sz); + int vs = _VS; + int vd = _VD; + int vt = _VT; u8 sregs[4], dregs[4], treg; - GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixS(sregs, sz, vs); // TODO: Prefixes seem strange... - GetVectorRegsPrefixT(&treg, V_Single, _VT); - GetVectorRegsPrefixD(dregs, sz, _VD); + GetVectorRegsPrefixT(&treg, V_Single, vt); + GetVectorRegsPrefixD(dregs, sz, vd); bool overlap = false; // For prefixes to work, we just have to ensure that none of the output registers spill // and that there's no overlap. 
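// The tempregs handling below exists because a VFPU output register may alias
// an input that is still needed: when that happens the result is staged in
// IRVTEMP registers and moved into place once every source has been read.
// A stand-in for the overlap test (the real helper is IsOverlapSafe; this only
// shows the intent, not its exact signature):

#include <cstdint>

static bool OverlapSafeSketch(int dreg, const uint8_t *sregs, int n) {
	for (int i = 0; i < n; i++) {
		if (sregs[i] == dreg)
			return false;  // writing dreg early would clobber a pending source
	}
	return true;
}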
- int tempregs[4]; + u8 tempregs[4]; + memcpy(tempregs, dregs, sizeof(tempregs)); for (int i = 0; i < n; ++i) { // Conservative, can be improved if (treg == dregs[i] || !IsOverlapSafe(dregs[i], n, sregs)) { // Need to use temp regs tempregs[i] = IRVTEMP_0 + i; overlap = true; - } else { - tempregs[i] = dregs[i]; } } - if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs) && !overlap) { - ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); - ApplyPrefixD(dregs, sz); - return; + if (n == 4 && IsConsecutive4(sregs) && IsConsecutive4(dregs)) { + if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) { + ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); + ApplyPrefixD(dregs, sz); + return; + } } for (int i = 0; i < n; i++) { @@ -1017,6 +1028,21 @@ namespace MIPSComp { ApplyPrefixD(dregs, sz); } + /* + // Capital = straight, lower case = transposed + // 8 possibilities: + ABC 2 + ABc missing + AbC 1 + Abc 1 + + aBC = ACB 2 + swap + aBc = AcB 1 + swap + abC = ACb missing + abc = Acb 1 + swap + + */ + // This may or may not be a win when using the IR interpreter... // Many more instructions to interpret. void IRFrontend::Comp_Vmmul(MIPSOpcode op) { @@ -1035,7 +1061,7 @@ namespace MIPSComp { MatrixOverlapType toverlap = GetMatrixOverlap(vt, vd, sz); // A very common arrangment. Rearrange to something we can handle. - if (IsMatrixTransposed(vd) && !IsMatrixTransposed(vs) && IsMatrixTransposed(vt)) { + if (IsMatrixTransposed(vd)) { // Matrix identity says (At * Bt) = (B * A)t // D = S * T // Dt = (S * T)t = (Tt * St) @@ -1051,12 +1077,16 @@ namespace MIPSComp { if (soverlap || toverlap) { DISABLE; } - if (sz == M_4x4 && IsConsecutive4(tregs) && IsConsecutive4(dregs)) { + + // dregs are always consecutive, thanks to our transpose trick. + // However, not sure this is always worth it. + if (sz == M_4x4 && IsConsecutive4(dregs)) { // TODO: The interpreter would like proper matrix ops better. Can generate those, and // expand them like this as needed on "real" architectures. int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; if (!IsConsecutive4(sregs)) { + // METHOD 1: Handles AbC and Abc for (int j = 0; j < 4; j++) { ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[j * 4]); for (int i = 1; i < 4; i++) { @@ -1066,7 +1096,10 @@ namespace MIPSComp { ir.Write(IROp::Vec4Mov, dregs[j * 4], s0); } return; - } else { + } else if (IsConsecutive4(tregs)) { + // METHOD 2: Handles ABC only. Not efficient on CPUs that don't do fast dots. + // Dots only work if tregs are consecutive. + // TODO: Skip this and resort to method one and transpose the output? for (int j = 0; j < 4; j++) { for (int i = 0; i < 4; i++) { ir.Write(IROp::Vec4Dot, s0 + i, sregs[i], tregs[j * 4]); @@ -1074,10 +1107,11 @@ namespace MIPSComp { ir.Write(IROp::Vec4Mov, dregs[j * 4], s0); } return; + } else { + // ABc - s consecutive, t not. + // Tekken uses this. + // logBlocks = 1; } - } else if (sz == M_4x4) { - // Tekken 6 has a case here: MEE - // logBlocks = 1; } // Fallback. 
Expands a LOT @@ -1126,44 +1160,50 @@ namespace MIPSComp { if (msz == M_4x4 && IsConsecutive4(sregs)) { int s0 = IRVTEMP_0; int s1 = IRVTEMP_PFX_T; - if (!IsConsecutive4(tregs)) { - ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); - for (int i = 1; i < 4; i++) { - if (!homogenous || (i != n - 1)) { - ir.Write(IROp::Vec4Scale, s1, sregs[i * 4], tregs[i]); - ir.Write(IROp::Vec4Add, s0, s0, s1); - } else { - ir.Write(IROp::Vec4Add, s0, s0, sregs[i * 4]); - } - } - - if (IsConsecutive4(dregs)) { - ir.Write(IROp::Vec4Mov, dregs[0], s0); + // For this algorithm, we don't care if tregs are consecutive or not, + // they are accessed one at a time. This handles homogenous transforms correctly, as well. + ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); + for (int i = 1; i < 4; i++) { + if (!homogenous || (i != n - 1)) { + ir.Write(IROp::Vec4Scale, s1, sregs[i * 4], tregs[i]); + ir.Write(IROp::Vec4Add, s0, s0, s1); } else { - for (int i = 0; i < 4; i++) { - ir.Write(IROp::FMov, dregs[i], s0 + i); - } + ir.Write(IROp::Vec4Add, s0, s0, sregs[i * 4]); } - return; - } else if (!homogenous) { + } + if (IsConsecutive4(dregs)) { + ir.Write(IROp::Vec4Mov, dregs[0], s0); + } else { for (int i = 0; i < 4; i++) { - ir.Write(IROp::Vec4Dot, s0 + i, sregs[i * 4], tregs[0]); + ir.Write(IROp::FMov, dregs[i], s0 + i); } - if (IsConsecutive4(dregs)) { - ir.Write(IROp::Vec4Mov, dregs[0], s0); + } + return; + } else if (msz == M_4x4 && !IsConsecutive4(sregs)) { + int s0 = IRVTEMP_0; + int s1 = IRVTEMP_PFX_S; + // Doesn't make complete sense to me why this works.... + ir.Write(IROp::Vec4Scale, s0, sregs[0], tregs[0]); + for (int i = 1; i < 4; i++) { + if (!homogenous || (i != n - 1)) { + ir.Write(IROp::Vec4Scale, s1, sregs[i], tregs[i]); + ir.Write(IROp::Vec4Add, s0, s0, s1); } else { - for (int i = 0; i < 4; i++) { - ir.Write(IROp::FMov, dregs[i], s0 + i); - } + ir.Write(IROp::Vec4Add, s0, s0, sregs[i]); } - return; } - } else if (msz == M_4x4) { - // logBlocks = 1; + if (IsConsecutive4(dregs)) { + ir.Write(IROp::Vec4Mov, dregs[0], s0); + } else { + for (int i = 0; i < 4; i++) { + ir.Write(IROp::FMov, dregs[i], s0 + i); + } + } + return; } // TODO: test overlap, optimize. 
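// Why the column trick works: for a 4x4 matrix M with columns c0..c3,
// M*t = t.x*c0 + t.y*c1 + t.z*c2 + t.w*c3, a linear combination of the columns.
// The Vec4Scale/Vec4Add sequence above is exactly that combination, with each
// consecutive register block playing the role of one column; the homogeneous
// form (vhtfm) just adds the last block unscaled because w is implicitly 1.
// The same transpose identity quoted in Comp_Vmmul, (S*T)^T = T^T * S^T, is what
// lets a transposed destination be rewritten as the product of the swapped,
// transposed operands. A scalar reference of the column combination, independent
// of how the VFPU register layout is fetched (m[c] holds column c):

static void MatVec4Reference(const float m[4][4], const float t[4], float out[4]) {
	for (int i = 0; i < 4; i++) {
		out[i] = m[0][i] * t[0] + m[1][i] * t[1] + m[2][i] * t[2] + m[3][i] * t[3];
	}
}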
- int tempregs[4]; + u8 tempregs[4]; int s0 = IRVTEMP_0; int temp1 = IRVTEMP_0 + 1; for (int i = 0; i < n; i++) { @@ -1216,7 +1256,7 @@ namespace MIPSComp { GetVectorRegs(tregs, sz, _VT); GetVectorRegs(dregs, sz, _VD); - int tempregs[4]; + u8 tempregs[4]; for (int i = 0; i < n; ++i) { if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) { tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things @@ -1383,7 +1423,7 @@ namespace MIPSComp { GetVectorRegsPrefixS(sregs, sz, _VS); GetVectorRegsPrefixD(dregs, sz, _VD); - int tempregs[4]; + u8 tempregs[4]; for (int i = 0; i < n; ++i) { if (!IsOverlapSafe(dregs[i], n, sregs)) { tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index eb0b892b06f4..04c3ea15241d 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -4,6 +4,27 @@ #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/IR/IRRegCache.h" +void WriteInstWithConstants(const IRWriter &in, IRWriter &out, const u32 *constants, IRInst inst) { + // Remap constants to the new reality + const IRMeta *m = GetIRMeta(inst.op); + switch (m->types[0]) { + case 'C': + inst.dest = out.AddConstant(constants[inst.dest]); + break; + } + switch (m->types[1]) { + case 'C': + inst.src1 = out.AddConstant(constants[inst.src1]); + break; + } + switch (m->types[2]) { + case 'C': + inst.src2 = out.AddConstant(constants[inst.src2]); + break; + } + out.Write(inst); +} + u32 Evaluate(u32 a, u32 b, IROp op) { switch (op) { case IROp::Add: case IROp::AddConst: return a + b; @@ -96,12 +117,7 @@ bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWri } bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { - //FMovToGPR a0, f12 - //FMovFromGPR f14, a0 - // to - //FMovToGPR a0, f12 - //FMov f14, f12 - + const u32 *constants = in.GetConstants().data(); bool logBlocks = false; IRInst prev; prev.op = IROp::Nop; @@ -109,6 +125,11 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { IRInst inst = in.GetInstructions()[i]; switch (inst.op) { case IROp::FMovFromGPR: + //FMovToGPR a0, f12 + //FMovFromGPR f14, a0 + // to + //FMovToGPR a0, f12 + //FMov f14, f12 if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) { inst.op = IROp::FMov; inst.src1 = prev.src1; @@ -118,16 +139,32 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { } break; - default: + // This will need to scan forward or keep track of more information to be useful. + // Just doing one isn't. + /* + case IROp::LoadVec4: + // AddConst a0, sp, 0x30 + // LoadVec4 v16, a0, 0x0 + // to + // AddConst a0, sp, 0x30 + // LoadVec4 v16, sp, 0x30 + if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) { + inst.src2 = out.AddConstant(constants[prev.src2] + constants[inst.src2]); + inst.src1 = prev.src1; + logBlocks = 1; + } else { + goto doDefault; + } out.Write(inst); break; + */ + default: + doDefault: + WriteInstWithConstants(in, out, constants, inst); + break; } prev = inst; } - // Can reuse the old constants array - not touching constants in this pass. 
- for (u32 value : in.GetConstants()) { - out.AddConstant(value); - } return logBlocks; } @@ -495,24 +532,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out) { doDefaultAndFlush: gpr.FlushAll(); doDefault: - // Remap constants to the new reality - const IRMeta *m = GetIRMeta(inst.op); - switch (m->types[0]) { - case 'C': - inst.dest = out.AddConstant(constants[inst.dest]); - break; - } - switch (m->types[1]) { - case 'C': - inst.src1 = out.AddConstant(constants[inst.src1]); - break; - } - switch (m->types[2]) { - case 'C': - inst.src2 = out.AddConstant(constants[inst.src2]); - break; - } - out.Write(inst); + WriteInstWithConstants(in, out, constants, inst); break; } } From 91bc3c31a58a12557a532b09cb81f78ddca75c54 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 14 May 2016 14:01:27 +0200 Subject: [PATCH 77/77] Warning fixes --- Core/MIPS/IR/IRPassSimplify.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index 04c3ea15241d..6b29efc24247 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -121,6 +121,9 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { bool logBlocks = false; IRInst prev; prev.op = IROp::Nop; + prev.dest = 0; + prev.src1 = 0; + prev.src2 = 0; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; switch (inst.op) { @@ -159,7 +162,6 @@ bool OptimizeFPMoves(const IRWriter &in, IRWriter &out) { break; */ default: - doDefault: WriteInstWithConstants(in, out, constants, inst); break; }
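
Side note (an editorial sketch, not part of the patch series above): the WriteInstWithConstants helper factored out in these patches captures the one invariant every IR pass must respect when copying instructions into a fresh IRWriter - any operand whose metadata type is 'C' is an index into the *source* writer's constant pool, so the value has to be re-added to the destination pool and the index rewritten. The stand-in types below (SketchInst, SketchWriter) are hypothetical simplifications; in the real code the per-operand types come from GetIRMeta(inst.op) rather than being stored on the instruction, and IRWriter's actual AddConstant semantics may differ from this deduplicating version.

// Minimal, self-contained sketch of the constant-remapping idea.
#include <cstdint>
#include <vector>

struct SketchInst {
	char types[3];       // 'C' marks an operand that indexes the constant pool.
	uint32_t ops[3];     // dest, src1, src2
};

struct SketchWriter {
	std::vector<SketchInst> insts;
	std::vector<uint32_t> constants;

	// Interns a constant, reusing an existing slot when the value is already present.
	uint32_t AddConstant(uint32_t value) {
		for (size_t i = 0; i < constants.size(); i++) {
			if (constants[i] == value)
				return (uint32_t)i;
		}
		constants.push_back(value);
		return (uint32_t)(constants.size() - 1);
	}

	// The WriteInstWithConstants pattern: copy an instruction from another writer,
	// re-adding constant operands so their indices are valid in this writer's pool.
	void CopyInst(const SketchWriter &src, SketchInst inst) {
		for (int i = 0; i < 3; i++) {
			if (inst.types[i] == 'C')
				inst.ops[i] = AddConstant(src.constants[inst.ops[i]]);
		}
		insts.push_back(inst);
	}
};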