diff --git a/CMakeLists.txt b/CMakeLists.txt
index b55e2390b..2b456f4a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,7 @@ include_directories(third_party/stb)
 include_directories(third_party/opengl)
 include_directories(third_party/miniaudio)
 include_directories(third_party/mio/single_include)
+include_directories(third_party/lockfree)
 
 add_compile_definitions(NOMINMAX)             # Make windows.h not define min/max macros because third-party deps don't like it
 add_compile_definitions(WIN32_LEAN_AND_MEAN)  # Make windows.h not include literally everything
@@ -301,7 +302,7 @@ if(ENABLE_QT_GUI)
         set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/context_wgl.cpp)
     else()
         set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/context_egl.cpp third_party/duckstation/gl/context_egl_wayland.cpp
-        third_party/duckstation/gl/context_egl_x11.cpp third_party/duckstation/gl/context_glx.cpp third_party/duckstation/gl/x11_window.cpp)
+        third_party/duckstation/gl/context_egl_x11.cpp third_party/duckstation/gl/x11_window.cpp)
     endif()
 endif()
 
@@ -325,14 +326,14 @@ if(ENABLE_OPENGL)
     set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
         include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
         include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
-        include/renderer_gl/gl_state.hpp
+        include/renderer_gl/gl_state.hpp include/renderer_gl/async_compiler.hpp
     )
 
     set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
         src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp
-        src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag
-        src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert
-        src/host_shaders/opengl_fragment_shader.frag
+        src/core/renderer_gl/gl_state.cpp src/core/renderer_gl/async_compiler.cpp
+        src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert
+        src/host_shaders/opengl_vertex_shader.vert src/host_shaders/opengl_fragment_shader.frag
     )
 
     set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
diff --git a/include/PICA/pica_frag_config.hpp b/include/PICA/pica_frag_config.hpp
index 5d5f84205..114b76f12 100644
--- a/include/PICA/pica_frag_config.hpp
+++ b/include/PICA/pica_frag_config.hpp
@@ -115,7 +115,7 @@ namespace PICA {
 			bumpSelector = Helpers::getBits<22, 2>(config0);
 			clampHighlights = Helpers::getBit<27>(config0);
 			bumpMode = Helpers::getBits<28, 2>(config0);
-			bumpRenorm = Helpers::getBit<30>(config0) ^ 1; // 0 = enable so flip it with xor
+			bumpRenorm = Helpers::getBit<30>(config0) ^ 1;  // 0 = enable so flip it with xor
 
 			for (int i = 0; i < totalLightCount; i++) {
 				auto& light = lights[i];
@@ -206,6 +206,27 @@ namespace PICA {
 			return std::memcmp(this, &config, sizeof(FragmentConfig)) == 0;
 		}
 
+		FragmentConfig& operator=(const FragmentConfig& config) {
+			// Copy assignment done field by field through the raw values: BitField copy constructor is deleted for reasons, so we have to do this manually
+			outConfig.raw = config.outConfig.raw;
+			texConfig = config.texConfig;
+			fogConfig.raw = config.fogConfig.raw;
+			lighting.raw = config.lighting.raw;
+			for (int i = 0; i < 7; i++) {  // 7 lighting LUT configs (same count as in the static_assert below)
+				lighting.luts[i].raw = config.lighting.luts[i].raw;
+			}
+			for (int i = 0; i < 8; i++) {  // 8 light configs (same count as in the static_assert below)
+				lighting.lights[i].raw = config.lighting.lights[i].raw;
+			}
+
+			// If this fails you probably added a new field to the struct and forgot to update this copy assignment operator
+			static_assert(
+				sizeof(FragmentConfig) == sizeof(outConfig.raw) + sizeof(texConfig) + sizeof(fogConfig.raw) + sizeof(lighting.raw) +
+											  7 * sizeof(LightingLUTConfig) + 8 * sizeof(Light)
+			);
+			return *this;
+		}
+
 		FragmentConfig(const std::array<u32, 0x300>& regs) : lighting(regs) {
 			auto alphaTestConfig = regs[InternalRegs::AlphaTestConfig];
 			auto alphaTestFunction = Helpers::getBits<4, 3>(alphaTestConfig);
diff --git a/include/config.hpp b/include/config.hpp
index 52be1af7e..8cf00b617 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -13,17 +13,17 @@ struct EmulatorConfig {
 	static constexpr bool shaderJitDefault = false;
 #endif
 
-	// For now, use specialized shaders by default on MacOS as M1 drivers are buggy when using the ubershader, and on Android since mobile GPUs are
-	// horrible. On other platforms we default to ubershader + shadergen fallback for lights
+// For now, use specialized shaders by default on MacOS as M1 drivers are buggy when using the ubershader, and on Android since mobile GPUs are
+// horrible. On other platforms we default to ubershader + shadergen fallback for lights
 #if defined(__ANDROID__) || defined(__APPLE__)
-	static constexpr bool ubershaderDefault = false;
+	static constexpr ShaderMode defaultShaderMode = ShaderMode::Specialized;
 #else
-	static constexpr bool ubershaderDefault = true;
+	static constexpr ShaderMode defaultShaderMode = ShaderMode::Ubershader;
 #endif
 
 	bool shaderJitEnabled = shaderJitDefault;
 	bool discordRpcEnabled = false;
-	bool useUbershaders = ubershaderDefault;
+	ShaderMode shaderMode = defaultShaderMode;
 	bool accurateShaderMul = false;
 
 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
diff --git a/include/emulator.hpp b/include/emulator.hpp
index de04648ea..6e60d9fa5 100644
--- a/include/emulator.hpp
+++ b/include/emulator.hpp
@@ -55,7 +55,7 @@ class Emulator {
 	static constexpr u32 width = 400;
 	static constexpr u32 height = 240 * 2;  // * 2 because 2 screens
 	ROMType romType = ROMType::None;
-	bool running = false;         // Is the emulator running a game?
+	bool running = false;  // Is the emulator running a game?
 
   private:
 #ifdef PANDA3DS_ENABLE_HTTP_SERVER
@@ -109,7 +109,7 @@ class Emulator {
 
 #ifdef PANDA3DS_FRONTEND_QT
 	// For passing the GL context from Qt to the renderer
-	void initGraphicsContext(GL::Context* glContext) { gpu.initGraphicsContext(nullptr); }
+	void initGraphicsContext(GL::Context* glContext) { gpu.initGraphicsContext(glContext); }
 #else
 	void initGraphicsContext(SDL_Window* window) { gpu.initGraphicsContext(window); }
 #endif
diff --git a/include/renderer.hpp b/include/renderer.hpp
index 569a730b7..3b6606bfb 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -1,8 +1,8 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>
 
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
@@ -20,6 +20,12 @@ enum class RendererType : s8 {
 	Software = 3,
 };
 
+enum class ShaderMode {
+	Specialized,  // GL renderer: always draw with the shader generated for the current fragment config
+	Ubershader,   // GL renderer: draw with the ubershader, optionally using shadergen for draws over the light threshold
+	Hybrid,       // GL renderer: draw with the ubershader while the specialized shader is built by the async compiler thread
+};
+
 struct EmulatorConfig;
 class GPU;
 struct SDL_Window;
@@ -56,6 +62,8 @@ class Renderer {
 	static constexpr u32 vertexBufferSize = 0x10000;
 	static std::optional<RendererType> typeFromString(std::string inString);
 	static const char* typeToString(RendererType rendererType);
+	static std::optional<ShaderMode> shaderModeFromString(std::string inString);
+	static const char* shaderModeToString(ShaderMode shaderMode);
 
 	virtual void reset() = 0;
 	virtual void display() = 0;                                                              // Display the 3DS screen contents to the window
@@ -77,7 +85,7 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}
 
-	virtual void setUbershaderSetting(bool value) {}
+	virtual void setShaderMode(ShaderMode shaderMode) {}
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/async_compiler.hpp b/include/renderer_gl/async_compiler.hpp
new file mode 100644
index 000000000..6635cb9e5
--- /dev/null
+++ b/include/renderer_gl/async_compiler.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <atomic>
+#include <thread>
+
+#include "PICA/pica_frag_config.hpp"
+#include "lockfree/spsc/queue.hpp"
+#include "opengl.hpp"
+#include "renderer_gl/renderer_gl.hpp"
+
+namespace PICA::ShaderGen {
+	class FragmentGenerator;
+}
+
+namespace AsyncCompiler {  // Frontend-provided hooks: each frontend source file defines these for its own windowing API
+	void* createContext(void* userdata);  // userdata is what the renderer was initialized with (SDL_Window* or GL::Context*); returns an opaque context
+	void makeCurrent(void* userdata, void* context);  // Called on the compiler thread to bind the context returned by createContext
+	void destroyContext(void* context);  // Called on the compiler thread once its loop has exited
+}  // namespace AsyncCompiler
+
+struct CompilingProgram {  // One unit of work handed to the compiler thread through the queue
+	CachedProgram* program;  // Shader cache entry whose GL program the compiler thread creates
+	PICA::FragmentConfig* config;  // Fragment configuration the shader source is generated from
+};
+
+struct AsyncCompilerThread {
+	explicit AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata);  // Starts the worker; userdata goes to AsyncCompiler::createContext/makeCurrent
+	~AsyncCompilerThread();  // Stops and joins the worker, then frees the preallocated slots
+
+	// Called from the emulator thread to queue a fragment configuration for compilation
+	// Returns nothing: if the queue is full this logs a warning and spins until the push succeeds
+	void PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram);
+
+	// Wait for all queued fragment configurations to be compiled
+	void Finish();
+
+  private:
+	PICA::ShaderGen::FragmentGenerator& fragShaderGen;
+	OpenGL::Shader defaultShadergenVs;  // Vertex shader created on the worker thread and linked into every program it builds
+
+	// Our lockfree queue only allows for trivial types, so we preallocate enough structs
+	// to avoid dynamic allocation on each push
+	int preallocatedProgramsIndex;  // Next slot of preallocatedPrograms to hand out (wraps around)
+	static constexpr int preallocatedProgramsSize = 256;
+	std::array<CompilingProgram*, preallocatedProgramsSize> preallocatedPrograms;
+	lockfree::spsc::Queue<CompilingProgram*, preallocatedProgramsSize - 1> programQueue;
+	std::atomic_bool running;  // Set in the constructor, cleared by the destructor to end the worker loop
+	std::atomic_bool hasWork;  // Raised by Finish(), cleared by the worker thread
+	std::thread thread;
+};
\ No newline at end of file
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 42b8bba1a..819e00f58 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -12,6 +12,7 @@
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "config.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@@ -22,6 +23,15 @@
 // More circular dependencies!
 class GPU;
 
+// Cached recompiled fragment shader, stored by value in RendererGL::shaderCache and referenced by pointer from the async compiler
+struct CachedProgram {
+	OpenGL::Program program;
+	std::atomic_bool compiling = false;  // Set before the entry is queued for async compilation, cleared by the compiler thread when the program is created
+	bool needsInitialization = true;  // True until getSpecializedShader has set the sampler uniforms and the UBO binding
+};
+
+struct AsyncCompilerThread;
+
 class RendererGL final : public Renderer {
 	GLStateManager gl = {};
 
@@ -30,9 +40,9 @@ class RendererGL final : public Renderer {
 
 	OpenGL::VertexArray vao;
 	OpenGL::VertexBuffer vbo;
-	bool enableUbershader = true;
+	ShaderMode shaderMode = EmulatorConfig::defaultShaderMode;
 
-	// Data 
+	// Data
 	struct {
 		// TEV configuration uniform locations
 		GLint textureEnvSourceLoc = -1;
@@ -71,12 +81,10 @@ class RendererGL final : public Renderer {
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
 
-	// Cached recompiled fragment shader
-	struct CachedProgram {
-		OpenGL::Program program;
-	};
 	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
 
+	AsyncCompilerThread* asyncCompiler = nullptr;
+
 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
 	OpenGL::Program& getSpecializedShader();
@@ -104,15 +112,15 @@ class RendererGL final : public Renderer {
 	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;  // Clear a GPU buffer in VRAM
 	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;  // Perform display transfer
 	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
-	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;             // Draw the given vertices
+	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;  // Draw the given vertices
 	void deinitGraphicsContext() override;
 
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
 
-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
-	
+	virtual void setShaderMode(ShaderMode mode) override { shaderMode = mode; }
+
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
@@ -122,7 +130,7 @@ class RendererGL final : public Renderer {
 	void initUbershader(OpenGL::Program& program);
 
 #ifdef PANDA3DS_FRONTEND_QT
-	virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override { initGraphicsContextInternal(); }
+	void initGraphicsContext(GL::Context* context) override;
 #endif
 
 	// Take a screenshot of the screen and store it in a file
diff --git a/src/config.cpp b/src/config.cpp
index dae5a0ab0..9cf6ef67b 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -60,9 +60,18 @@ void EmulatorConfig::load() {
 				rendererType = RendererType::OpenGL;
 			}
 
+			auto shaderModeName = toml::find_or<std::string>(gpu, "ShaderMode", Renderer::shaderModeToString(defaultShaderMode));
+			auto configShaderMode = Renderer::shaderModeFromString(shaderModeName);
+
+			if (configShaderMode.has_value()) {
+				shaderMode = configShaderMode.value();
+			} else {
+				Helpers::warn("Invalid shader mode specified: %s\n", shaderModeName.c_str());
+				shaderMode = defaultShaderMode;
+			}
+
 			shaderJitEnabled = toml::find_or<toml::boolean>(gpu, "EnableShaderJIT", shaderJitDefault);
 			vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
-			useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 			accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
 
 			forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
@@ -127,12 +136,12 @@ void EmulatorConfig::save() {
 	data["General"]["EnableDiscordRPC"] = discordRpcEnabled;
 	data["General"]["UsePortableBuild"] = usePortableBuild;
 	data["General"]["DefaultRomPath"] = defaultRomPath.string();
-	
+
 	data["GPU"]["EnableShaderJIT"] = shaderJitEnabled;
 	data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType));
 	data["GPU"]["EnableVSync"] = vsyncEnabled;
 	data["GPU"]["AccurateShaderMultiplication"] = accurateShaderMul;
-	data["GPU"]["UseUbershaders"] = useUbershaders;
+	data["GPU"]["ShaderMode"] = std::string(Renderer::shaderModeToString(shaderMode));
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
 
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index fe336edc8..7d486b28b 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -117,7 +117,7 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;
 
-	renderer->setUbershaderSetting(config.useUbershaders);
+	renderer->setShaderMode(config.shaderMode);
 	renderer->reset();
 }
 
@@ -365,7 +365,7 @@ PICA::Vertex GPU::getImmediateModeVertex() {
 
 	// Run VS and return vertex data. TODO: Don't hardcode offsets for each attribute
 	shaderUnit.vs.run();
-	
+
 	// Map shader outputs to fixed function properties
 	const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
 	for (int i = 0; i < totalShaderOutputs; i++) {
diff --git a/src/core/renderer_gl/async_compiler.cpp b/src/core/renderer_gl/async_compiler.cpp
new file mode 100644
index 000000000..d00d589bf
--- /dev/null
+++ b/src/core/renderer_gl/async_compiler.cpp
@@ -0,0 +1,72 @@
+#include "renderer_gl/async_compiler.hpp"
+
+// Preallocates the work slots, creates the worker's GL context on the calling thread, then starts the compile loop on a new thread
+AsyncCompilerThread::AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata) : fragShaderGen(fragShaderGen) {
+	preallocatedProgramsIndex = 0;
+	running.store(true);
+	hasWork.store(false);  // The worker reads this flag, so it must hold a defined value before the thread starts
+	for (int i = 0; i < preallocatedProgramsSize; i++) {
+		preallocatedPrograms[i] = new CompilingProgram();
+		preallocatedPrograms[i]->config = new PICA::FragmentConfig({});
+	}
+
+	// The context needs to be created on the main thread so that we can make it shared with that
+	// thread's context
+	void* context = AsyncCompiler::createContext(userdata);
+	thread = std::thread([this, userdata, context]() {
+		AsyncCompiler::makeCurrent(userdata, context);
+		printf("Async compiler started, GL version: %s\n", glGetString(GL_VERSION));
+		std::string defaultShadergenVSSource = this->fragShaderGen.getDefaultVertexShader();
+		defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
+
+		while (running.load()) {
+			const bool finishRequested = hasWork.load();  // Sample BEFORE draining: pushes made before Finish() are then seen by this drain
+			CompilingProgram* item;
+			while (programQueue.Pop(item)) {
+				OpenGL::Program& glProgram = item->program->program;
+				std::string fs = this->fragShaderGen.generate(*item->config);
+				OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
+				glProgram.create({defaultShadergenVs, fragShader});
+				item->program->compiling.store(false);
+				fragShader.free();
+			}
+			if (finishRequested) hasWork.store(false);  // Never acknowledge a Finish() raised mid-drain; the next iteration handles it
+			std::this_thread::sleep_for(std::chrono::milliseconds(10));
+		}
+
+		AsyncCompiler::destroyContext(context);
+	});
+}
+
+AsyncCompilerThread::~AsyncCompilerThread() {
+	running.store(false);  // Makes the worker's outer loop exit on its next check
+	thread.join();  // Wait for the worker to exit before freeing the slots it reads from
+
+	for (int i = 0; i < preallocatedProgramsSize; i++) {  // Release everything allocated in the constructor
+		delete preallocatedPrograms[i]->config;
+		delete preallocatedPrograms[i];
+	}
+}
+
+void AsyncCompilerThread::PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram) {  // Queues config to be compiled into cachedProgram
+	CompilingProgram* newProgram = preallocatedPrograms[preallocatedProgramsIndex];  // Take the next preallocated slot
+	newProgram->program = cachedProgram;
+	*newProgram->config = config;  // Copy the config into the slot's own storage
+	preallocatedProgramsIndex = (preallocatedProgramsIndex + 1) % preallocatedProgramsSize;  // Advance with wrap-around
+	bool pushed = programQueue.Push(newProgram);
+
+	if (!pushed) {
+		Helpers::warn("AsyncCompilerThread: Queue full, spinning");
+
+		while (!pushed) {  // Busy-wait, retrying the push until it succeeds
+			pushed = programQueue.Push(newProgram);
+		}
+	}
+}
+
+void AsyncCompilerThread::Finish() {
+	hasWork.store(true);  // Raise the flag; the worker thread is the one that clears it
+
+	// Wait for the compiler thread to finish any outstanding work (busy-waits until the worker clears the flag)
+	while (hasWork.load()) {}
+}
\ No newline at end of file
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index f8fc31e7d..d0fc33851 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -9,6 +9,7 @@
 #include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"
+#include "renderer_gl/async_compiler.hpp"
 #include "math_util.hpp"
 
 CMRC_DECLARE(RendererGL);
@@ -172,9 +173,23 @@ void RendererGL::initGraphicsContextInternal() {
 	defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
 }
 
-// The OpenGL renderer doesn't need to do anything with the GL context (For Qt frontend) or the SDL window (For SDL frontend)
-// So we just call initGraphicsContextInternal for both
-void RendererGL::initGraphicsContext([[maybe_unused]] SDL_Window* window) { initGraphicsContextInternal(); }
+void RendererGL::initGraphicsContext(SDL_Window* context) {  // Despite its name, "context" is the SDL window; it is forwarded as the async compiler's userdata
+	if (shaderMode == ShaderMode::Hybrid) {  // NOTE(review): relies on setShaderMode having been called before this point - confirm
+		asyncCompiler = new AsyncCompilerThread(fragShaderGen, context);  // NOTE(review): owning raw pointer - confirm it is deleted on deinit
+	}
+
+	initGraphicsContextInternal();
+}
+
+#ifdef PANDA3DS_FRONTEND_QT
+void RendererGL::initGraphicsContext(GL::Context* context) {  // Qt frontend variant: the GL context is forwarded as the async compiler's userdata
+	if (shaderMode == ShaderMode::Hybrid) {  // NOTE(review): relies on setShaderMode having been called before this point - confirm
+		asyncCompiler = new AsyncCompilerThread(fragShaderGen, context);  // NOTE(review): owning raw pointer - confirm it is deleted on deinit
+	}
+
+	initGraphicsContextInternal();
+}
+#endif
 
 // Set up the OpenGL blending context to match the emulated PICA
 void RendererGL::setupBlending() {
@@ -414,23 +429,38 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};
 
-	bool usingUbershader = enableUbershader;
-	if (usingUbershader) {
+	if (shaderMode == ShaderMode::Ubershader) {
 		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
 		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
 
 		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
 		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
 		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
-			usingUbershader = false;
+			OpenGL::Program& program = getSpecializedShader();
+			gl.useProgram(program);
+		} else {
+			gl.useProgram(triangleProgram);
 		}
-	}
-		
-	if (usingUbershader) {
-		gl.useProgram(triangleProgram);
-	} else {
+	} else if (shaderMode == ShaderMode::Specialized) {
 		OpenGL::Program& program = getSpecializedShader();
 		gl.useProgram(program);
+	} else if (shaderMode == ShaderMode::Hybrid) {
+		PICA::FragmentConfig fsConfig(regs);
+		auto cachedProgram = shaderCache.find(fsConfig);
+
+		if (cachedProgram == shaderCache.end()) {
+			CachedProgram& program = shaderCache[fsConfig];
+			program.compiling.store(true);
+			asyncCompiler->PushFragmentConfig(fsConfig, &program);
+			gl.useProgram(triangleProgram);
+		} else if (cachedProgram->second.compiling.load(std::memory_order_relaxed)) {
+			gl.useProgram(triangleProgram);
+		} else {
+			OpenGL::Program& program = getSpecializedShader();
+			gl.useProgram(program);
+		}
+	} else {
+		Helpers::panic("Invalid shader mode");
 	}
 
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
@@ -458,7 +488,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
 
 	// Update ubershader uniforms
-	if (usingUbershader) {
+	if (gl.currentProgram == triangleProgram.handle()) {
 		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
 		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
 		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
@@ -844,14 +874,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	OpenGL::Program& program = programEntry.program;
 
 	if (!program.exists()) {
+		if (shaderMode == ShaderMode::Hybrid) [[unlikely]] {
+			Helpers::panic("Compiling shaders in main thread, this should never happen");
+		}
+
 		std::string fs = fragShaderGen.generate(fsConfig);
 
 		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
 		program.create({defaultShadergenVs, fragShader});
-		gl.useProgram(program);
 
 		fragShader.free();
+	}
 
+	if (programEntry.needsInitialization) {
+		gl.useProgram(program);
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
@@ -862,6 +898,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
 		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
 		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		programEntry.needsInitialization = false;
 	}
 	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);
 
@@ -979,6 +1016,11 @@ void RendererGL::screenshot(const std::string& name) {
 }
 
 void RendererGL::clearShaderCache() {
+	if (asyncCompiler != nullptr && shaderMode == ShaderMode::Hybrid) {
+		// May contain objects that are still in use, so we need to clear them first
+		asyncCompiler->Finish();
+	}
+
 	for (auto& shader : shaderCache) {
 		CachedProgram& cachedProgram = shader.second;
 		cachedProgram.program.free();
diff --git a/src/hydra_core.cpp b/src/hydra_core.cpp
index acbf30a8b..1b1ec0265 100644
--- a/src/hydra_core.cpp
+++ b/src/hydra_core.cpp
@@ -162,3 +162,12 @@ HC_API const char* getInfo(hydra::InfoType type) {
 		default: return nullptr;
 	}
 }
+
+namespace AsyncCompiler {  // Stub definitions of the async compiler hooks for this frontend
+	void* createContext(void* mainContext) {
+		return nullptr;  // No context is created here
+	}
+
+	void makeCurrent(void* mainContext, void* context) {}  // Intentionally empty
+	void destroyContext(void* context) {}  // Intentionally empty
+}  // namespace AsyncCompiler
diff --git a/src/jni_driver.cpp b/src/jni_driver.cpp
index e4ce2b399..bdd34470b 100644
--- a/src/jni_driver.cpp
+++ b/src/jni_driver.cpp
@@ -4,10 +4,10 @@
 
 #include <stdexcept>
 
+#include "android_utils.hpp"
 #include "emulator.hpp"
 #include "renderer_gl/renderer_gl.hpp"
 #include "services/hid.hpp"
-#include "android_utils.hpp"
 
 std::unique_ptr<Emulator> emulator = nullptr;
 HIDService* hidService = nullptr;
@@ -40,17 +40,17 @@ JNIEnv* jniEnv() {
 extern "C" {
 
 #define MAKE_SETTING(functionName, type, settingName) \
-AlberFunction(void, functionName) (JNIEnv* env, jobject obj, type value) { emulator->getConfig().settingName = value; }
+	AlberFunction(void, functionName)(JNIEnv * env, jobject obj, type value) { emulator->getConfig().settingName = value; }
 
 MAKE_SETTING(setShaderJitEnabled, jboolean, shaderJitEnabled)
 
 #undef MAKE_SETTING
 
 AlberFunction(void, Setup)(JNIEnv* env, jobject obj) {
-    env->GetJavaVM(&jvm);
+	env->GetJavaVM(&jvm);
 
-    alberClass = (jclass)env->NewGlobalRef((jclass)env->FindClass("com/panda3ds/pandroid/AlberDriver"));
-    alberClassOpenDocument = env->GetStaticMethodID(alberClass, "openDocument", "(Ljava/lang/String;Ljava/lang/String;)I");
+	alberClass = (jclass)env->NewGlobalRef((jclass)env->FindClass("com/panda3ds/pandroid/AlberDriver"));
+	alberClassOpenDocument = env->GetStaticMethodID(alberClass, "openDocument", "(Ljava/lang/String;Ljava/lang/String;)I");
 }
 
 AlberFunction(void, Pause)(JNIEnv* env, jobject obj) { emulator->pause(); }
@@ -128,15 +128,15 @@ AlberFunction(jbyteArray, GetSmdh)(JNIEnv* env, jobject obj) {
 #undef AlberFunction
 
 int AndroidUtils::openDocument(const char* path, const char* perms) {
-    auto env = jniEnv();
+	auto env = jniEnv();
 
-    jstring uri = env->NewStringUTF(path);
-    jstring jmode = env->NewStringUTF(perms);
+	jstring uri = env->NewStringUTF(path);
+	jstring jmode = env->NewStringUTF(perms);
 
-    jint result = env->CallStaticIntMethod(alberClass, alberClassOpenDocument, uri, jmode);
+	jint result = env->CallStaticIntMethod(alberClass, alberClassOpenDocument, uri, jmode);
 
-    env->DeleteLocalRef(uri);
-    env->DeleteLocalRef(jmode);
+	env->DeleteLocalRef(uri);
+	env->DeleteLocalRef(jmode);
 
-    return (int)result;
-}
\ No newline at end of file
+	return (int)result;
+}
diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp
index b099067fc..b1571df0a 100644
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@@ -150,8 +150,8 @@ static void configInit() {
 	static const retro_variable values[] = {
 		{"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"},
 		{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
-		{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
-																	  : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
+		{"panda3ds_use_ubershader", EmulatorConfig::defaultShaderMode == ShaderMode::Ubershader ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
+																				: "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
 		{"panda3ds_use_vsync", "Enable VSync; enabled|disabled"},
 		{"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"},
 		{"panda3ds_use_audio", "Enable audio; disabled|enabled"},
@@ -180,7 +180,9 @@ static void configUpdate() {
 	config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true);
 	config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false);
-	config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", true);
+	config.shaderMode = FetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::defaultShaderMode == ShaderMode::Ubershader)
+							? ShaderMode::Ubershader
+							: ShaderMode::Specialized;
 	config.forceShadergenForLights = FetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = std::clamp(std::stoi(FetchVariable("panda3ds_ubershader_lighting_override_threshold", "1")), 1, 8);
 	config.discordRpcEnabled = false;
@@ -403,3 +405,13 @@ void retro_cheat_set(uint index, bool enabled, const char* code) {
 void retro_cheat_reset() {
 	emulator->getCheats().reset();
 }
+
+namespace AsyncCompiler {  // Stub definitions of the async compiler hooks for this frontend
+	void* createContext(void* mainContext) {
+		return nullptr;  // No context is created here
+	}
+
+	void makeCurrent(void* mainContext, void* context) {}  // Intentionally empty
+
+	void destroyContext(void* context) {}  // Intentionally empty
+}  // namespace AsyncCompiler
diff --git a/src/panda_qt/main_window.cpp b/src/panda_qt/main_window.cpp
index 65769116e..24303d795 100644
--- a/src/panda_qt/main_window.cpp
+++ b/src/panda_qt/main_window.cpp
@@ -6,8 +6,10 @@
 #include <cmath>
 #include <cstdio>
 #include <fstream>
+#include <memory>
 
 #include "cheats.hpp"
+#include "gl/context.h"
 #include "input_mappings.hpp"
 #include "services/dsp.hpp"
 
@@ -601,3 +603,32 @@ void MainWindow::pollControllers() {
 		}
 	}
 }
+
+namespace AsyncCompiler {  // Qt frontend definitions of the async compiler hooks, built on GL::Context
+	void* createContext(void* mainContext) {
+		GL::Context* glContext = (GL::Context*)mainContext;  // mainContext is the GL::Context* the renderer was initialized with
+
+		// Unlike the SDL function, this doesn't make it current so we don't
+		// need to call MakeCurrent on the mainContext
+		WindowInfo wi = glContext->GetWindowInfo();
+		wi.type = WindowInfo::Type::Surfaceless;  // Request a surfaceless context instead of reusing the window's type
+
+		std::unique_ptr<GL::Context> iLoveBeingForcedToUseRAII = glContext->CreateSharedContext(wi);
+
+		if (!iLoveBeingForcedToUseRAII) {
+			Helpers::panic("Failed to create shared GL context");
+		}
+
+		return iLoveBeingForcedToUseRAII.release();  // Ownership passes to the caller; destroyContext deletes it
+	}
+
+	void makeCurrent(void* unused, void* context) {
+		GL::Context* glContext = (GL::Context*)context;  // context is the pointer returned by createContext
+		glContext->MakeCurrent();
+	}
+
+	void destroyContext(void* context) {
+		GL::Context* glContext = (GL::Context*)context;
+		delete glContext;  // Matches the release() in createContext
+	}
+}  // namespace AsyncCompiler
\ No newline at end of file
diff --git a/src/panda_sdl/frontend_sdl.cpp b/src/panda_sdl/frontend_sdl.cpp
index 77b1f55fd..2e806fe20 100644
--- a/src/panda_sdl/frontend_sdl.cpp
+++ b/src/panda_sdl/frontend_sdl.cpp
@@ -35,6 +35,11 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, config.rendererType == RendererType::Software ? 3 : 4);
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, config.rendererType == RendererType::Software ? 3 : 1);
+		
+		if (config.shaderMode == ShaderMode::Hybrid) {
+			SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+		}
+		
 		window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 400, 480, SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE);
 
 		if (window == nullptr) {
@@ -46,6 +51,16 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp
 			Helpers::panic("OpenGL context creation failed: %s", SDL_GetError());
 		}
 
+		if (config.shaderMode == ShaderMode::Hybrid) {
+			// As per the wiki you should check the value after creating the context
+			// as it can differ from the requested value
+			int sharingEnabled;
+			SDL_GL_GetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, &sharingEnabled);
+			if (!sharingEnabled) {
+				Helpers::panic("OpenGL context sharing not enabled");
+			}
+		}
+
 		if (!gladLoadGLLoader(reinterpret_cast<GLADloadproc>(SDL_GL_GetProcAddress))) {
 			Helpers::panic("OpenGL init failed");
 		}
@@ -342,3 +357,28 @@ void FrontendSDL::run() {
 		SDL_GL_SwapWindow(window);
 	}
 }
+
+namespace AsyncCompiler {
+	// Creates an extra GL context for `window` (an SDL_Window*) and hands it back as an opaque pointer.
+	// The context that was current on entry is re-bound before returning.
+	void* createContext(void* window) {
+		auto* const targetWindow = static_cast<SDL_Window*>(window);
+		// Remember the active context: SDL_GL_CreateContext makes the new one current as a side effect
+		const SDL_GLContext previousContext = SDL_GL_GetCurrentContext();
+		const SDL_GLContext createdContext = SDL_GL_CreateContext(targetWindow);
+		if (createdContext == nullptr) {
+			Helpers::panic("OpenGL context creation failed: %s", SDL_GetError());
+		}
+
+		SDL_GL_MakeCurrent(targetWindow, previousContext);
+		return createdContext;
+	}
+
+	// Makes `context` current for `window` through SDL_GL_MakeCurrent.
+	void makeCurrent(void* window, void* context) {
+		SDL_GL_MakeCurrent(static_cast<SDL_Window*>(window), static_cast<SDL_GLContext>(context));
+	}
+
+	// Deletes the SDL GL context passed in as an opaque pointer.
+	void destroyContext(void* context) { SDL_GL_DeleteContext(static_cast<SDL_GLContext>(context)); }
+}
\ No newline at end of file
diff --git a/src/renderer.cpp b/src/renderer.cpp
index 76c3e7a02..9399133d1 100644
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -36,4 +36,34 @@ const char* Renderer::typeToString(RendererType rendererType) {
 		case RendererType::Software: return "software";
 		default: return "Invalid";
 	}
+}
+
+std::optional<ShaderMode> Renderer::shaderModeFromString(std::string inString) {
+	// Every accepted (lower-case) spelling and the shader mode it selects
+	static const std::unordered_map<std::string, ShaderMode> knownNames = {
+		{"specialized", ShaderMode::Specialized},
+		{"special", ShaderMode::Specialized},
+		{"ubershader", ShaderMode::Ubershader},
+		{"uber", ShaderMode::Ubershader},
+		{"hybrid", ShaderMode::Hybrid},
+		{"threaded", ShaderMode::Hybrid},
+		{"i hate opengl context creation", ShaderMode::Hybrid},
+	};
+
+	// Fold the input to lower-case in place so that matching ignores case
+	for (char& ch : inString) {
+		ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+	}
+	const auto match = knownNames.find(inString);
+	if (match == knownNames.end()) return std::nullopt;
+	return match->second;
+}
+
+const char* Renderer::shaderModeToString(ShaderMode shaderMode) {
+	// Canonical lower-case name of each known mode; any other value yields "Invalid"
+	if (shaderMode == ShaderMode::Specialized) return "specialized";
+	if (shaderMode == ShaderMode::Ubershader) return "ubershader";
+	if (shaderMode == ShaderMode::Hybrid) return "hybrid";
+
+	return "Invalid";
 }
\ No newline at end of file
diff --git a/third_party/duckstation/duckstation_scoped_guard.h b/third_party/duckstation/duckstation_scoped_guard.h
index 89f35d92f..c9e57ddd9 100644
--- a/third_party/duckstation/duckstation_scoped_guard.h
+++ b/third_party/duckstation/duckstation_scoped_guard.h
@@ -19,6 +19,14 @@ class ScopedGuard final
   /// Prevents the function from being invoked when we go out of scope.
   ALWAYS_INLINE void Cancel() { m_func.reset(); }
 
+  /// Runs the guarded function now (no-op if the guard holds no function), then disarms the guard.
+  ALWAYS_INLINE void Run() {
+    if (m_func.has_value()) {
+      m_func.value()();
+      m_func.reset();
+    }
+  }
+
   /// Explicitly fires the function.
   ALWAYS_INLINE void Invoke()
   {
diff --git a/third_party/duckstation/gl/context.cpp b/third_party/duckstation/gl/context.cpp
index 69401bd95..e06fc5359 100644
--- a/third_party/duckstation/gl/context.cpp
+++ b/third_party/duckstation/gl/context.cpp
@@ -74,14 +74,7 @@ std::unique_ptr<GL::Context> Context::Create(const WindowInfo& wi, const Version
   context = ContextAGL::Create(wi, versions_to_try, num_versions_to_try);
 #else
   if (wi.type == WindowInfo::Type::X11)
-  {
-    const char* use_egl_x11 = std::getenv("USE_EGL_X11");
-    if (use_egl_x11 && std::strcmp(use_egl_x11, "1") == 0)
-      context = ContextEGLX11::Create(wi, versions_to_try, num_versions_to_try);
-    else
-      context = ContextGLX::Create(wi, versions_to_try, num_versions_to_try);
-  }
-
+    context = ContextEGLX11::Create(wi, versions_to_try, num_versions_to_try);
 #ifdef WAYLAND_ENABLED
   if (wi.type == WindowInfo::Type::Wayland)
     context = ContextEGLWayland::Create(wi, versions_to_try, num_versions_to_try);
diff --git a/third_party/duckstation/gl/context_egl_x11.cpp b/third_party/duckstation/gl/context_egl_x11.cpp
index 6db6c10b1..bb5e40f99 100644
--- a/third_party/duckstation/gl/context_egl_x11.cpp
+++ b/third_party/duckstation/gl/context_egl_x11.cpp
@@ -20,6 +20,7 @@ std::unique_ptr<Context> ContextEGLX11::CreateSharedContext(const WindowInfo& wi
 {
   std::unique_ptr<ContextEGLX11> context = std::make_unique<ContextEGLX11>(wi);
   context->m_display = m_display;
+  context->m_supports_surfaceless = m_supports_surfaceless;
 
   if (!context->CreateContextAndSurface(m_version, m_context, false))
     return nullptr;
diff --git a/third_party/duckstation/gl/context_wgl.cpp b/third_party/duckstation/gl/context_wgl.cpp
index 47ec4b1e8..3837656b5 100644
--- a/third_party/duckstation/gl/context_wgl.cpp
+++ b/third_party/duckstation/gl/context_wgl.cpp
@@ -19,6 +19,17 @@ static void* GetProcAddressCallback(const char* name)
 }
 
 namespace GL {
+// Calls gladLoadWGL for `dc`; logs an error and returns false if it reports failure.
+static bool ReloadWGL(HDC dc)
+{
+  const bool loaded = gladLoadWGL(dc) != 0;
+  if (!loaded)
+  {
+    Log_ErrorPrint("Loading GLAD WGL functions failed");
+  }
+  return loaded;
+}
+
 ContextWGL::ContextWGL(const WindowInfo& wi) : Context(wi) {}
 
 ContextWGL::~ContextWGL()
@@ -149,8 +160,8 @@ std::unique_ptr<Context> ContextWGL::CreateSharedContext(const WindowInfo& wi)
   }
   else
   {
-    Log_ErrorPrint("PBuffer not implemented");
-    return nullptr;
+    if (!context->CreatePBuffer())
+      return nullptr;
   }
 
   if (m_version.profile == Profile::NoProfile)
@@ -305,6 +316,32 @@ bool ContextWGL::CreatePBuffer()
 
   static constexpr const int pb_attribs[] = {0, 0};
 
+  HGLRC temp_rc = nullptr;
+  ScopedGuard temp_rc_guard([&temp_rc, hdc]() {
+    if (temp_rc)
+    {
+      wglMakeCurrent(hdc, nullptr);
+      wglDeleteContext(temp_rc);
+    }
+  });
+
+  if (!GLAD_WGL_ARB_pbuffer)
+  {
+    // we're probably running completely surfaceless... need a temporary context.
+    temp_rc = wglCreateContext(hdc);
+    if (!temp_rc || !wglMakeCurrent(hdc, temp_rc))
+    {
+      Log_ErrorPrint("Failed to create temporary context to load WGL for pbuffer.");
+      return false;
+    }
+
+    if (!ReloadWGL(hdc) || !GLAD_WGL_ARB_pbuffer)
+    {
+      Log_ErrorPrint("Missing WGL_ARB_pbuffer");
+      return false;
+    }
+  }
+
   AssertMsg(m_pixel_format.has_value(), "Has pixel format for pbuffer");
   HPBUFFERARB pbuffer = wglCreatePbufferARB(hdc, m_pixel_format.value(), 1, 1, pb_attribs);
   if (!pbuffer)
@@ -326,6 +363,7 @@ bool ContextWGL::CreatePBuffer()
   m_dummy_dc = hdc;
   m_pbuffer = pbuffer;
 
+  temp_rc_guard.Run();
   pbuffer_guard.Cancel();
   hdc_guard.Cancel();
   hwnd_guard.Cancel();
diff --git a/third_party/lockfree/LICENSE b/third_party/lockfree/LICENSE
new file mode 100644
index 000000000..2cb6782f8
--- /dev/null
+++ b/third_party/lockfree/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Djordje Nedic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/third_party/lockfree/lockfree/spsc/queue.hpp b/third_party/lockfree/lockfree/spsc/queue.hpp
new file mode 100755
index 000000000..97a8dc3f3
--- /dev/null
+++ b/third_party/lockfree/lockfree/spsc/queue.hpp
@@ -0,0 +1,110 @@
+/**************************************************************
+ * @file queue.hpp
+ * @brief A queue implementation written in standard c++11
+ * suitable for both low-end microcontrollers all the way
+ * to HPC machines. Lock-free for single consumer single
+ * producer scenarios.
+ **************************************************************/
+
+/**************************************************************
+ * Copyright (c) 2023 Djordje Nedic
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, subject to the
+ * following conditions:
+ *
+ * The above copyright notice and this permission notice shall
+ * be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of lockfree
+ *
+ * Author:          Djordje Nedic <nedic.djordje2@gmail.com>
+ * Version:         v2.0.9
+ **************************************************************/
+
+/************************** INCLUDE ***************************/
+#ifndef LOCKFREE_QUEUE_HPP
+#define LOCKFREE_QUEUE_HPP
+
+#include <atomic>
+#include <cstddef>
+#include <type_traits>
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#include <optional>
+#endif
+
+namespace lockfree {
+namespace spsc {
+/*************************** TYPES ****************************/
+
+template <typename T, size_t size> class Queue {
+    static_assert(std::is_trivial<T>::value, "The type T must be trivial");
+    static_assert(size > 2, "Buffer size must be bigger than 2"); /* one slot stays unused: capacity is size - 1 */
+
+    /********************** PUBLIC METHODS ************************/
+  public:
+    Queue(); /**< Creates an empty queue (read and write index both 0) */
+
+    /**
+     * @brief Copies an element into the queue.
+     * Should only be called from the producer thread.
+     * @param[in] element Value to store
+     * @retval true if the element was stored, false if the queue was full
+     */
+    bool Push(const T &element);
+
+    /**
+     * @brief Removes the oldest element from the queue.
+     * Should only be called from the consumer thread.
+     * @param[out] element Receives the removed value; not written on failure
+     * @retval true if an element was removed, false if the queue was empty
+     */
+    bool Pop(T &element);
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+    /**
+     * @brief Removes the oldest element from the queue (C++17 and later).
+     * Should only be called from the consumer thread.
+     * @retval The removed element, or an empty optional if the queue was empty
+     */
+    std::optional<T> PopOptional();
+#endif
+
+    /********************** PRIVATE MEMBERS ***********************/
+  private:
+    T _data[size]; /**< Element storage, indexed by _r and _w */
+#if LOCKFREE_CACHE_COHERENT
+    alignas(LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _r; /**< Read index */
+    alignas(
+        LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _w; /**< Write index */
+#else /* also taken when LOCKFREE_CACHE_COHERENT is not defined (evaluates to 0) */
+    std::atomic_size_t _r; /**< Read index (next slot to pop) */
+    std::atomic_size_t _w; /**< Write index (next slot to fill) */
+#endif
+};
+
+} /* namespace spsc */
+} /* namespace lockfree */
+
+/************************** INCLUDE ***************************/
+
+/* Include the implementation */
+#include "queue_impl.hpp"
+
+#endif /* LOCKFREE_QUEUE_HPP */
diff --git a/third_party/lockfree/lockfree/spsc/queue_impl.hpp b/third_party/lockfree/lockfree/spsc/queue_impl.hpp
new file mode 100644
index 000000000..43654c88e
--- /dev/null
+++ b/third_party/lockfree/lockfree/spsc/queue_impl.hpp
@@ -0,0 +1,111 @@
+/**************************************************************
+ * @file queue_impl.hpp
+ * @brief A queue implementation written in standard c++11
+ * suitable for both low-end microcontrollers all the way
+ * to HPC machines. Lock-free for single consumer single
+ * producer scenarios.
+ **************************************************************/
+
+/**************************************************************
+ * Copyright (c) 2023 Djordje Nedic
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, subject to the
+ * following conditions:
+ *
+ * The above copyright notice and this permission notice shall
+ * be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of lockfree
+ *
+ * Author:          Djordje Nedic <nedic.djordje2@gmail.com>
+ * Version:         v2.0.9
+ **************************************************************/
+
+namespace lockfree {
+namespace spsc {
+/********************** PUBLIC METHODS ************************/
+
+template <typename T, size_t size> Queue<T, size>::Queue() : _r(0U), _w(0U) {} /* equal indices: the queue starts out empty */
+
+template <typename T, size_t size> bool Queue<T, size>::Push(const T &element) {
+    /* Returns false if full. The full check uses the next (wrapped) write
+       index so a wrapped read index with the write index at the end is not
+       missed; as a result one slot always stays unused (capacity size - 1).
+     */
+    const size_t w = _w.load(std::memory_order_relaxed);
+    size_t w_next = w + 1;
+    if (w_next == size) {
+        w_next = 0U;
+    }
+
+    /* Full check: this acquire load pairs with the release store of _r in Pop() */
+    const size_t r = _r.load(std::memory_order_acquire);
+    if (w_next == r) {
+        return false;
+    }
+
+    /* Copy the element into the slot at the current write index */
+    _data[w] = element;
+
+    /* Publish the next write index; release orders the element write before it */
+    _w.store(w_next, std::memory_order_release);
+    return true;
+}
+
+template <typename T, size_t size> bool Queue<T, size>::Pop(T &element) {
+    /* _r is read relaxed; the acquire load of _w pairs with the release store in Push() */
+    size_t r = _r.load(std::memory_order_relaxed);
+    const size_t w = _w.load(std::memory_order_acquire);
+
+    /* Empty check: equal indices mean nothing to read; element is left untouched */
+    if (r == w) {
+        return false;
+    }
+
+    /* Copy the element out of the slot at the read index */
+    element = _data[r];
+
+    /* Advance the read index, wrapping back to 0 at size */
+    r++;
+    if (r == size) {
+        r = 0U;
+    }
+
+    /* Store the new read index; release orders the copy above before it */
+    _r.store(r, std::memory_order_release);
+    return true;
+}
+
+/********************* std::optional API **********************/
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+template <typename T, size_t size>
+std::optional<T> Queue<T, size>::PopOptional() {
+    /* Thin wrapper over Pop(): an empty optional stands for "nothing was
+       popped", otherwise the popped value is returned. */
+    T popped;
+    if (!Pop(popped)) {
+        return std::nullopt;
+    }
+
+    return popped;
+}
+#endif
+
+} /* namespace spsc */
+} /* namespace lockfree */