diff --git a/CMakeLists.txt b/CMakeLists.txt index b55e2390b..2b456f4a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ include_directories(third_party/stb) include_directories(third_party/opengl) include_directories(third_party/miniaudio) include_directories(third_party/mio/single_include) +include_directories(third_party/lockfree) add_compile_definitions(NOMINMAX) # Make windows.h not define min/max macros because third-party deps don't like it add_compile_definitions(WIN32_LEAN_AND_MEAN) # Make windows.h not include literally everything @@ -301,7 +302,7 @@ if(ENABLE_QT_GUI) set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/context_wgl.cpp) else() set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/context_egl.cpp third_party/duckstation/gl/context_egl_wayland.cpp - third_party/duckstation/gl/context_egl_x11.cpp third_party/duckstation/gl/context_glx.cpp third_party/duckstation/gl/x11_window.cpp) + third_party/duckstation/gl/context_egl_x11.cpp third_party/duckstation/gl/x11_window.cpp) endif() endif() @@ -325,14 +326,14 @@ if(ENABLE_OPENGL) set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp - include/renderer_gl/gl_state.hpp + include/renderer_gl/gl_state.hpp include/renderer_gl/async_compiler.hpp ) set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp - src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag - src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert - src/host_shaders/opengl_fragment_shader.frag + src/core/renderer_gl/gl_state.cpp src/core/renderer_gl/async_compiler.cpp + src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert + src/host_shaders/opengl_vertex_shader.vert 
src/host_shaders/opengl_fragment_shader.frag ) set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES}) diff --git a/include/PICA/pica_frag_config.hpp b/include/PICA/pica_frag_config.hpp index 5d5f84205..114b76f12 100644 --- a/include/PICA/pica_frag_config.hpp +++ b/include/PICA/pica_frag_config.hpp @@ -115,7 +115,7 @@ namespace PICA { bumpSelector = Helpers::getBits<22, 2>(config0); clampHighlights = Helpers::getBit<27>(config0); bumpMode = Helpers::getBits<28, 2>(config0); - bumpRenorm = Helpers::getBit<30>(config0) ^ 1; // 0 = enable so flip it with xor + bumpRenorm = Helpers::getBit<30>(config0) ^ 1; // 0 = enable so flip it with xor for (int i = 0; i < totalLightCount; i++) { auto& light = lights[i]; @@ -206,6 +206,27 @@ namespace PICA { return std::memcmp(this, &config, sizeof(FragmentConfig)) == 0; } + FragmentConfig& operator=(const FragmentConfig& config) { + // BitField copy constructor is deleted for reasons, so we have to do this manually + outConfig.raw = config.outConfig.raw; + texConfig = config.texConfig; + fogConfig.raw = config.fogConfig.raw; + lighting.raw = config.lighting.raw; + for (int i = 0; i < 7; i++) { + lighting.luts[i].raw = config.lighting.luts[i].raw; + } + for (int i = 0; i < 8; i++) { + lighting.lights[i].raw = config.lighting.lights[i].raw; + } + + // If this fails you probably added a new field to the struct and forgot to update the copy constructor + static_assert( + sizeof(FragmentConfig) == sizeof(outConfig.raw) + sizeof(texConfig) + sizeof(fogConfig.raw) + sizeof(lighting.raw) + + 7 * sizeof(LightingLUTConfig) + 8 * sizeof(Light) + ); + return *this; + } + FragmentConfig(const std::array& regs) : lighting(regs) { auto alphaTestConfig = regs[InternalRegs::AlphaTestConfig]; auto alphaTestFunction = Helpers::getBits<4, 3>(alphaTestConfig); diff --git a/include/config.hpp b/include/config.hpp index 52be1af7e..8cf00b617 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -13,17 +13,17 @@ struct EmulatorConfig { static 
constexpr bool shaderJitDefault = false; #endif - // For now, use specialized shaders by default on MacOS as M1 drivers are buggy when using the ubershader, and on Android since mobile GPUs are - // horrible. On other platforms we default to ubershader + shadergen fallback for lights +// For now, use specialized shaders by default on MacOS as M1 drivers are buggy when using the ubershader, and on Android since mobile GPUs are +// horrible. On other platforms we default to ubershader + shadergen fallback for lights #if defined(__ANDROID__) || defined(__APPLE__) - static constexpr bool ubershaderDefault = false; + static constexpr ShaderMode defaultShaderMode = ShaderMode::Specialized; #else - static constexpr bool ubershaderDefault = true; + static constexpr ShaderMode defaultShaderMode = ShaderMode::Ubershader; #endif bool shaderJitEnabled = shaderJitDefault; bool discordRpcEnabled = false; - bool useUbershaders = ubershaderDefault; + ShaderMode shaderMode = defaultShaderMode; bool accurateShaderMul = false; // Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance diff --git a/include/emulator.hpp b/include/emulator.hpp index de04648ea..6e60d9fa5 100644 --- a/include/emulator.hpp +++ b/include/emulator.hpp @@ -55,7 +55,7 @@ class Emulator { static constexpr u32 width = 400; static constexpr u32 height = 240 * 2; // * 2 because 2 screens ROMType romType = ROMType::None; - bool running = false; // Is the emulator running a game? + bool running = false; // Is the emulator running a game? 
private: #ifdef PANDA3DS_ENABLE_HTTP_SERVER @@ -109,7 +109,7 @@ class Emulator { #ifdef PANDA3DS_FRONTEND_QT // For passing the GL context from Qt to the renderer - void initGraphicsContext(GL::Context* glContext) { gpu.initGraphicsContext(nullptr); } + void initGraphicsContext(GL::Context* glContext) { gpu.initGraphicsContext(glContext); } #else void initGraphicsContext(SDL_Window* window) { gpu.initGraphicsContext(window); } #endif diff --git a/include/renderer.hpp b/include/renderer.hpp index 569a730b7..3b6606bfb 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -1,8 +1,8 @@ #pragma once #include +#include #include #include -#include #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" @@ -20,6 +20,12 @@ enum class RendererType : s8 { Software = 3, }; +enum class ShaderMode { + Specialized, + Ubershader, + Hybrid, +}; + struct EmulatorConfig; class GPU; struct SDL_Window; @@ -56,6 +62,8 @@ class Renderer { static constexpr u32 vertexBufferSize = 0x10000; static std::optional typeFromString(std::string inString); static const char* typeToString(RendererType rendererType); + static std::optional shaderModeFromString(std::string inString); + static const char* shaderModeToString(ShaderMode shaderMode); virtual void reset() = 0; virtual void display() = 0; // Display the 3DS screen contents to the window @@ -77,7 +85,7 @@ class Renderer { virtual std::string getUbershader() { return ""; } virtual void setUbershader(const std::string& shader) {} - virtual void setUbershaderSetting(bool value) {} + virtual void setShaderMode(ShaderMode shaderMode) {} // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/async_compiler.hpp b/include/renderer_gl/async_compiler.hpp new file mode 100644 index 000000000..6635cb9e5 --- /dev/null +++ b/include/renderer_gl/async_compiler.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + 
+#include "PICA/pica_frag_config.hpp" +#include "lockfree/spsc/queue.hpp" +#include "opengl.hpp" +#include "renderer_gl/renderer_gl.hpp" + +namespace PICA::ShaderGen { + class FragmentGenerator; +} + +namespace AsyncCompiler { + void* createContext(void* userdata); + void makeCurrent(void* userdata, void* context); + void destroyContext(void* context); +} // namespace AsyncCompiler + +struct CompilingProgram { + CachedProgram* program; + PICA::FragmentConfig* config; +}; + +struct AsyncCompilerThread { + explicit AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata); + ~AsyncCompilerThread(); + + // Called from the emulator thread to queue a fragment configuration for compilation + // Returns false if the queue is full, true otherwise + void PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram); + + // Wait for all queued fragment configurations to be compiled + void Finish(); + + private: + PICA::ShaderGen::FragmentGenerator& fragShaderGen; + OpenGL::Shader defaultShadergenVs; + + // Our lockfree queue only allows for trivial types, so we preallocate enough structs + // to avoid dynamic allocation on each push + int preallocatedProgramsIndex; + static constexpr int preallocatedProgramsSize = 256; + std::array preallocatedPrograms; + lockfree::spsc::Queue programQueue; + std::atomic_bool running; + std::atomic_bool hasWork; + std::thread thread; +}; \ No newline at end of file diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 42b8bba1a..819e00f58 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -12,6 +12,7 @@ #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" #include "PICA/shader_gen.hpp" +#include "config.hpp" #include "gl_state.hpp" #include "helpers.hpp" #include "logger.hpp" @@ -22,6 +23,15 @@ // More circular dependencies! 
class GPU; +// Cached recompiled fragment shader +struct CachedProgram { + OpenGL::Program program; + std::atomic_bool compiling = false; + bool needsInitialization = true; +}; + +struct AsyncCompilerThread; + class RendererGL final : public Renderer { GLStateManager gl = {}; @@ -30,9 +40,9 @@ class RendererGL final : public Renderer { OpenGL::VertexArray vao; OpenGL::VertexBuffer vbo; - bool enableUbershader = true; + ShaderMode shaderMode = EmulatorConfig::defaultShaderMode; - // Data + // Data struct { // TEV configuration uniform locations GLint textureEnvSourceLoc = -1; @@ -71,12 +81,10 @@ class RendererGL final : public Renderer { OpenGL::Shader defaultShadergenVs; GLuint shadergenFragmentUBO; - // Cached recompiled fragment shader - struct CachedProgram { - OpenGL::Program program; - }; std::unordered_map shaderCache; + AsyncCompilerThread* asyncCompiler = nullptr; + OpenGL::Framebuffer getColourFBO(); OpenGL::Texture getTexture(Texture& tex); OpenGL::Program& getSpecializedShader(); @@ -104,15 +112,15 @@ class RendererGL final : public Renderer { void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override; // Clear a GPU buffer in VRAM void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override; // Perform display transfer void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override; - void drawVertices(PICA::PrimType primType, std::span vertices) override; // Draw the given vertices + void drawVertices(PICA::PrimType primType, std::span vertices) override; // Draw the given vertices void deinitGraphicsContext() override; virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - virtual void setUbershaderSetting(bool value) override { enableUbershader = value; } - + virtual void setShaderMode(ShaderMode mode) override { 
shaderMode = mode; } + std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); // Note: The caller is responsible for deleting the currently bound FBO before calling this @@ -122,7 +130,7 @@ class RendererGL final : public Renderer { void initUbershader(OpenGL::Program& program); #ifdef PANDA3DS_FRONTEND_QT - virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override { initGraphicsContextInternal(); } + void initGraphicsContext(GL::Context* context) override; #endif // Take a screenshot of the screen and store it in a file diff --git a/src/config.cpp b/src/config.cpp index dae5a0ab0..9cf6ef67b 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -60,9 +60,18 @@ void EmulatorConfig::load() { rendererType = RendererType::OpenGL; } + auto shaderModeName = toml::find_or(gpu, "ShaderMode", Renderer::shaderModeToString(defaultShaderMode)); + auto configShaderMode = Renderer::shaderModeFromString(shaderModeName); + + if (configShaderMode.has_value()) { + shaderMode = configShaderMode.value(); + } else { + Helpers::warn("Invalid shader mode specified: %s\n", shaderModeName.c_str()); + shaderMode = defaultShaderMode; + } + shaderJitEnabled = toml::find_or(gpu, "EnableShaderJIT", shaderJitDefault); vsyncEnabled = toml::find_or(gpu, "EnableVSync", true); - useUbershaders = toml::find_or(gpu, "UseUbershaders", ubershaderDefault); accurateShaderMul = toml::find_or(gpu, "AccurateShaderMultiplication", false); forceShadergenForLights = toml::find_or(gpu, "ForceShadergenForLighting", true); @@ -127,12 +136,12 @@ void EmulatorConfig::save() { data["General"]["EnableDiscordRPC"] = discordRpcEnabled; data["General"]["UsePortableBuild"] = usePortableBuild; data["General"]["DefaultRomPath"] = defaultRomPath.string(); - + data["GPU"]["EnableShaderJIT"] = shaderJitEnabled; data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType)); data["GPU"]["EnableVSync"] = vsyncEnabled; 
data["GPU"]["AccurateShaderMultiplication"] = accurateShaderMul; - data["GPU"]["UseUbershaders"] = useUbershaders; + data["GPU"]["ShaderMode"] = std::string(Renderer::shaderModeToString(shaderMode)); data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights; data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold; diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index fe336edc8..7d486b28b 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -117,7 +117,7 @@ void GPU::reset() { externalRegs[Framebuffer1Config] = static_cast(PICA::ColorFmt::RGB8); externalRegs[Framebuffer1Select] = 0; - renderer->setUbershaderSetting(config.useUbershaders); + renderer->setShaderMode(config.shaderMode); renderer->reset(); } @@ -365,7 +365,7 @@ PICA::Vertex GPU::getImmediateModeVertex() { // Run VS and return vertex data. TODO: Don't hardcode offsets for each attribute shaderUnit.vs.run(); - + // Map shader outputs to fixed function properties const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; for (int i = 0; i < totalShaderOutputs; i++) { diff --git a/src/core/renderer_gl/async_compiler.cpp b/src/core/renderer_gl/async_compiler.cpp new file mode 100644 index 000000000..d00d589bf --- /dev/null +++ b/src/core/renderer_gl/async_compiler.cpp @@ -0,0 +1,72 @@ +#include "renderer_gl/async_compiler.hpp" + +AsyncCompilerThread::AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata) : fragShaderGen(fragShaderGen) { + preallocatedProgramsIndex = 0; + running.store(true); + + for (int i = 0; i < preallocatedProgramsSize; i++) { + preallocatedPrograms[i] = new CompilingProgram(); + preallocatedPrograms[i]->config = new PICA::FragmentConfig({}); + } + + // The context needs to be created on the main thread so that we can make it shared with that + // thread's context + void* context = AsyncCompiler::createContext(userdata); + thread = std::thread([this, userdata, context]() { + 
AsyncCompiler::makeCurrent(userdata, context); + printf("Async compiler started, GL version: %s\n", glGetString(GL_VERSION)); + + std::string defaultShadergenVSSource = this->fragShaderGen.getDefaultVertexShader(); + defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex); + + while (running.load()) { + CompilingProgram* item; + while (programQueue.Pop(item)) { + OpenGL::Program& glProgram = item->program->program; + std::string fs = this->fragShaderGen.generate(*item->config); + OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); + glProgram.create({defaultShadergenVs, fragShader}); + item->program->compiling.store(false); + fragShader.free(); + } + + hasWork.store(false); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + AsyncCompiler::destroyContext(context); + }); +} + +AsyncCompilerThread::~AsyncCompilerThread() { + running.store(false); + thread.join(); + + for (int i = 0; i < preallocatedProgramsSize; i++) { + delete preallocatedPrograms[i]->config; + delete preallocatedPrograms[i]; + } +} + +void AsyncCompilerThread::PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram) { + CompilingProgram* newProgram = preallocatedPrograms[preallocatedProgramsIndex]; + newProgram->program = cachedProgram; + *newProgram->config = config; + preallocatedProgramsIndex = (preallocatedProgramsIndex + 1) % preallocatedProgramsSize; + bool pushed = programQueue.Push(newProgram); + + if (!pushed) { + Helpers::warn("AsyncCompilerThread: Queue full, spinning"); + + while (!pushed) { + pushed = programQueue.Push(newProgram); + } + } +} + +void AsyncCompilerThread::Finish() { + hasWork.store(true); + + // Wait for the compiler thread to finish any outstanding work + while (hasWork.load()) {} +} \ No newline at end of file diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index f8fc31e7d..d0fc33851 100644 --- 
a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -9,6 +9,7 @@ #include "PICA/pica_frag_uniforms.hpp" #include "PICA/gpu.hpp" #include "PICA/regs.hpp" +#include "renderer_gl/async_compiler.hpp" #include "math_util.hpp" CMRC_DECLARE(RendererGL); @@ -172,9 +173,23 @@ void RendererGL::initGraphicsContextInternal() { defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex); } -// The OpenGL renderer doesn't need to do anything with the GL context (For Qt frontend) or the SDL window (For SDL frontend) -// So we just call initGraphicsContextInternal for both -void RendererGL::initGraphicsContext([[maybe_unused]] SDL_Window* window) { initGraphicsContextInternal(); } +void RendererGL::initGraphicsContext(SDL_Window* context) { + if (shaderMode == ShaderMode::Hybrid) { + asyncCompiler = new AsyncCompilerThread(fragShaderGen, context); + } + + initGraphicsContextInternal(); +} + +#ifdef PANDA3DS_FRONTEND_QT +void RendererGL::initGraphicsContext(GL::Context* context) { + if (shaderMode == ShaderMode::Hybrid) { + asyncCompiler = new AsyncCompilerThread(fragShaderGen, context); + } + + initGraphicsContextInternal(); +} +#endif // Set up the OpenGL blending context to match the emulated PICA void RendererGL::setupBlending() { @@ -414,23 +429,38 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::Triangle, }; - bool usingUbershader = enableUbershader; - if (usingUbershader) { + if (shaderMode == ShaderMode::Ubershader) { const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen // This way we generate fewer shaders overall than with full shadergen, but don't tank performance if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount 
>= emulatorConfig->lightShadergenThreshold) { - usingUbershader = false; + OpenGL::Program& program = getSpecializedShader(); + gl.useProgram(program); + } else { + gl.useProgram(triangleProgram); } - } - - if (usingUbershader) { - gl.useProgram(triangleProgram); - } else { + } else if (shaderMode == ShaderMode::Specialized) { OpenGL::Program& program = getSpecializedShader(); gl.useProgram(program); + } else if (shaderMode == ShaderMode::Hybrid) { + PICA::FragmentConfig fsConfig(regs); + auto cachedProgram = shaderCache.find(fsConfig); + + if (cachedProgram == shaderCache.end()) { + CachedProgram& program = shaderCache[fsConfig]; + program.compiling.store(true); + asyncCompiler->PushFragmentConfig(fsConfig, &program); + gl.useProgram(triangleProgram); + } else if (cachedProgram->second.compiling.load(std::memory_order_relaxed)) { + gl.useProgram(triangleProgram); + } else { + OpenGL::Program& program = getSpecializedShader(); + gl.useProgram(program); + } + } else { + Helpers::panic("Invalid shader mode"); } const auto primitiveTopology = primTypes[static_cast(primType)]; @@ -458,7 +488,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v static constexpr std::array depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL}; // Update ubershader uniforms - if (usingUbershader) { + if (gl.currentProgram == triangleProgram.handle()) { const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32(); const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1; @@ -844,14 +874,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() { OpenGL::Program& program = programEntry.program; if (!program.exists()) { + if (shaderMode == ShaderMode::Hybrid) [[unlikely]] { + Helpers::panic("Compiling shaders in main thread, this should never happen"); + } + std::string fs 
= fragShaderGen.generate(fsConfig); OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); program.create({defaultShadergenVs, fragShader}); - gl.useProgram(program); fragShader.free(); + } + if (programEntry.needsInitialization) { + gl.useProgram(program); // Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3 glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0); glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); @@ -862,6 +898,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() { // As it's an OpenGL 4.2 feature that MacOS doesn't support... uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms"); glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding); + programEntry.needsInitialization = false; } glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO); @@ -979,6 +1016,11 @@ void RendererGL::screenshot(const std::string& name) { } void RendererGL::clearShaderCache() { + if (asyncCompiler != nullptr && shaderMode == ShaderMode::Hybrid) { + // May contain objects that are still in use, so we need to clear them first + asyncCompiler->Finish(); + } + for (auto& shader : shaderCache) { CachedProgram& cachedProgram = shader.second; cachedProgram.program.free(); diff --git a/src/hydra_core.cpp b/src/hydra_core.cpp index acbf30a8b..1b1ec0265 100644 --- a/src/hydra_core.cpp +++ b/src/hydra_core.cpp @@ -162,3 +162,12 @@ HC_API const char* getInfo(hydra::InfoType type) { default: return nullptr; } } + +namespace AsyncCompiler { + void* createContext(void* mainContext) { + return nullptr; + } + + void makeCurrent(void* mainContext, void* context) {} + void destroyContext(void* context) {} +} // namespace AsyncCompiler diff --git a/src/jni_driver.cpp b/src/jni_driver.cpp index e4ce2b399..bdd34470b 100644 --- a/src/jni_driver.cpp +++ b/src/jni_driver.cpp @@ -4,10 +4,10 @@ #include +#include "android_utils.hpp" #include 
"emulator.hpp" #include "renderer_gl/renderer_gl.hpp" #include "services/hid.hpp" -#include "android_utils.hpp" std::unique_ptr emulator = nullptr; HIDService* hidService = nullptr; @@ -40,17 +40,17 @@ JNIEnv* jniEnv() { extern "C" { #define MAKE_SETTING(functionName, type, settingName) \ -AlberFunction(void, functionName) (JNIEnv* env, jobject obj, type value) { emulator->getConfig().settingName = value; } + AlberFunction(void, functionName)(JNIEnv * env, jobject obj, type value) { emulator->getConfig().settingName = value; } MAKE_SETTING(setShaderJitEnabled, jboolean, shaderJitEnabled) #undef MAKE_SETTING AlberFunction(void, Setup)(JNIEnv* env, jobject obj) { - env->GetJavaVM(&jvm); + env->GetJavaVM(&jvm); - alberClass = (jclass)env->NewGlobalRef((jclass)env->FindClass("com/panda3ds/pandroid/AlberDriver")); - alberClassOpenDocument = env->GetStaticMethodID(alberClass, "openDocument", "(Ljava/lang/String;Ljava/lang/String;)I"); + alberClass = (jclass)env->NewGlobalRef((jclass)env->FindClass("com/panda3ds/pandroid/AlberDriver")); + alberClassOpenDocument = env->GetStaticMethodID(alberClass, "openDocument", "(Ljava/lang/String;Ljava/lang/String;)I"); } AlberFunction(void, Pause)(JNIEnv* env, jobject obj) { emulator->pause(); } @@ -128,15 +128,15 @@ AlberFunction(jbyteArray, GetSmdh)(JNIEnv* env, jobject obj) { #undef AlberFunction int AndroidUtils::openDocument(const char* path, const char* perms) { - auto env = jniEnv(); + auto env = jniEnv(); - jstring uri = env->NewStringUTF(path); - jstring jmode = env->NewStringUTF(perms); + jstring uri = env->NewStringUTF(path); + jstring jmode = env->NewStringUTF(perms); - jint result = env->CallStaticIntMethod(alberClass, alberClassOpenDocument, uri, jmode); + jint result = env->CallStaticIntMethod(alberClass, alberClassOpenDocument, uri, jmode); - env->DeleteLocalRef(uri); - env->DeleteLocalRef(jmode); + env->DeleteLocalRef(uri); + env->DeleteLocalRef(jmode); - return (int)result; -} \ No newline at end of file + return 
(int)result; +} diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp index b099067fc..b1571df0a 100644 --- a/src/libretro_core.cpp +++ b/src/libretro_core.cpp @@ -150,8 +150,8 @@ static void configInit() { static const retro_variable values[] = { {"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"}, {"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"}, - {"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled" - : "Use ubershaders (No stutter, maybe slower); disabled|enabled"}, + {"panda3ds_use_ubershader", EmulatorConfig::defaultShaderMode == ShaderMode::Ubershader ? "Use ubershaders (No stutter, maybe slower); enabled|disabled" + : "Use ubershaders (No stutter, maybe slower); disabled|enabled"}, {"panda3ds_use_vsync", "Enable VSync; enabled|disabled"}, {"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"}, {"panda3ds_use_audio", "Enable audio; disabled|enabled"}, @@ -180,7 +180,9 @@ static void configUpdate() { config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true); config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false); config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false); - config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", true); + config.shaderMode = FetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::defaultShaderMode == ShaderMode::Ubershader) + ? 
ShaderMode::Ubershader + : ShaderMode::Specialized; config.forceShadergenForLights = FetchVariableBool("panda3ds_ubershader_lighting_override", true); config.lightShadergenThreshold = std::clamp(std::stoi(FetchVariable("panda3ds_ubershader_lighting_override_threshold", "1")), 1, 8); config.discordRpcEnabled = false; @@ -403,3 +405,13 @@ void retro_cheat_set(uint index, bool enabled, const char* code) { void retro_cheat_reset() { emulator->getCheats().reset(); } + +namespace AsyncCompiler { + void* createContext(void* mainContext) { + return nullptr; + } + + void makeCurrent(void* mainContext, void* context) {} + + void destroyContext(void* context) {} +} // namespace AsyncCompiler diff --git a/src/panda_qt/main_window.cpp b/src/panda_qt/main_window.cpp index 65769116e..24303d795 100644 --- a/src/panda_qt/main_window.cpp +++ b/src/panda_qt/main_window.cpp @@ -6,8 +6,10 @@ #include #include #include +#include #include "cheats.hpp" +#include "gl/context.h" #include "input_mappings.hpp" #include "services/dsp.hpp" @@ -601,3 +603,32 @@ void MainWindow::pollControllers() { } } } + +namespace AsyncCompiler { + void* createContext(void* mainContext) { + GL::Context* glContext = (GL::Context*)mainContext; + + // Unlike the SDL function, this doesn't make it current so we don't + // need to call MakeCurrent on the mainContext + WindowInfo wi = glContext->GetWindowInfo(); + wi.type = WindowInfo::Type::Surfaceless; + + std::unique_ptr iLoveBeingForcedToUseRAII = glContext->CreateSharedContext(wi); + + if (!iLoveBeingForcedToUseRAII) { + Helpers::panic("Failed to create shared GL context"); + } + + return iLoveBeingForcedToUseRAII.release(); + } + + void makeCurrent(void* unused, void* context) { + GL::Context* glContext = (GL::Context*)context; + glContext->MakeCurrent(); + } + + void destroyContext(void* context) { + GL::Context* glContext = (GL::Context*)context; + delete glContext; + } +} // namespace AsyncCompiler \ No newline at end of file diff --git 
a/src/panda_sdl/frontend_sdl.cpp b/src/panda_sdl/frontend_sdl.cpp index 77b1f55fd..2e806fe20 100644 --- a/src/panda_sdl/frontend_sdl.cpp +++ b/src/panda_sdl/frontend_sdl.cpp @@ -35,6 +35,11 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, config.rendererType == RendererType::Software ? 3 : 4); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, config.rendererType == RendererType::Software ? 3 : 1); + + if (config.shaderMode == ShaderMode::Hybrid) { + SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1); + } + window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 400, 480, SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE); if (window == nullptr) { @@ -46,6 +51,16 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp Helpers::panic("OpenGL context creation failed: %s", SDL_GetError()); } + if (config.shaderMode == ShaderMode::Hybrid) { + // As per the wiki you should check the value after creating the context + // as it can differ from the requested value + int sharingEnabled; + SDL_GL_GetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, &sharingEnabled); + if (!sharingEnabled) { + Helpers::panic("OpenGL context sharing not enabled"); + } + } + if (!gladLoadGLLoader(reinterpret_cast(SDL_GL_GetProcAddress))) { Helpers::panic("OpenGL init failed"); } @@ -342,3 +357,28 @@ void FrontendSDL::run() { SDL_GL_SwapWindow(window); } } + +namespace AsyncCompiler { + void* createContext(void* window) { + SDL_Window* sdlWindow = static_cast(window); + + // SDL_GL_CreateContext also makes it the current context so we need to switch back after creation + SDL_GLContext currentContext = SDL_GL_GetCurrentContext(); + SDL_GLContext glContext = SDL_GL_CreateContext(sdlWindow); + + if (glContext == nullptr) { + Helpers::panic("OpenGL context creation failed: %s", 
SDL_GetError()); + } + + SDL_GL_MakeCurrent(sdlWindow, currentContext); + return glContext; + } + + void makeCurrent(void* window, void* context) { + SDL_GL_MakeCurrent((SDL_Window*)window, (SDL_GLContext)context); + } + + void destroyContext(void* context) { + SDL_GL_DeleteContext(static_cast(context)); + } +} \ No newline at end of file diff --git a/src/renderer.cpp b/src/renderer.cpp index 76c3e7a02..9399133d1 100644 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -36,4 +36,34 @@ const char* Renderer::typeToString(RendererType rendererType) { case RendererType::Software: return "software"; default: return "Invalid"; } +} + +std::optional Renderer::shaderModeFromString(std::string inString) { + // Transform to lower-case to make the setting case-insensitive + std::transform(inString.begin(), inString.end(), inString.begin(), [](unsigned char c) { return std::tolower(c); }); + + static const std::unordered_map map = { + {"specialized", ShaderMode::Specialized}, + {"special", ShaderMode::Specialized}, + {"ubershader", ShaderMode::Ubershader}, + {"uber", ShaderMode::Ubershader}, + {"hybrid", ShaderMode::Hybrid}, + {"threaded", ShaderMode::Hybrid}, + {"i hate opengl context creation", ShaderMode::Hybrid}, + }; + + if (auto search = map.find(inString); search != map.end()) { + return search->second; + } + + return std::nullopt; +} + +const char* Renderer::shaderModeToString(ShaderMode shaderMode) { + switch (shaderMode) { + case ShaderMode::Specialized: return "specialized"; + case ShaderMode::Ubershader: return "ubershader"; + case ShaderMode::Hybrid: return "hybrid"; + default: return "Invalid"; + } } \ No newline at end of file diff --git a/third_party/duckstation/duckstation_scoped_guard.h b/third_party/duckstation/duckstation_scoped_guard.h index 89f35d92f..c9e57ddd9 100644 --- a/third_party/duckstation/duckstation_scoped_guard.h +++ b/third_party/duckstation/duckstation_scoped_guard.h @@ -19,6 +19,14 @@ class ScopedGuard final /// Prevents the function from 
being invoked when we go out of scope. ALWAYS_INLINE void Cancel() { m_func.reset(); } + /// Runs the destructor function now instead of when we go out of scope. + ALWAYS_INLINE void Run() { + if (!m_func.has_value()) return; + + m_func.value()(); + m_func.reset(); + } + /// Explicitly fires the function. ALWAYS_INLINE void Invoke() { diff --git a/third_party/duckstation/gl/context.cpp b/third_party/duckstation/gl/context.cpp index 69401bd95..e06fc5359 100644 --- a/third_party/duckstation/gl/context.cpp +++ b/third_party/duckstation/gl/context.cpp @@ -74,14 +74,7 @@ std::unique_ptr Context::Create(const WindowInfo& wi, const Version context = ContextAGL::Create(wi, versions_to_try, num_versions_to_try); #else if (wi.type == WindowInfo::Type::X11) - { - const char* use_egl_x11 = std::getenv("USE_EGL_X11"); - if (use_egl_x11 && std::strcmp(use_egl_x11, "1") == 0) - context = ContextEGLX11::Create(wi, versions_to_try, num_versions_to_try); - else - context = ContextGLX::Create(wi, versions_to_try, num_versions_to_try); - } - + context = ContextEGLX11::Create(wi, versions_to_try, num_versions_to_try); #ifdef WAYLAND_ENABLED if (wi.type == WindowInfo::Type::Wayland) context = ContextEGLWayland::Create(wi, versions_to_try, num_versions_to_try); diff --git a/third_party/duckstation/gl/context_egl_x11.cpp b/third_party/duckstation/gl/context_egl_x11.cpp index 6db6c10b1..bb5e40f99 100644 --- a/third_party/duckstation/gl/context_egl_x11.cpp +++ b/third_party/duckstation/gl/context_egl_x11.cpp @@ -20,6 +20,7 @@ std::unique_ptr ContextEGLX11::CreateSharedContext(const WindowInfo& wi { std::unique_ptr context = std::make_unique(wi); context->m_display = m_display; + context->m_supports_surfaceless = m_supports_surfaceless; if (!context->CreateContextAndSurface(m_version, m_context, false)) return nullptr; diff --git a/third_party/duckstation/gl/context_wgl.cpp b/third_party/duckstation/gl/context_wgl.cpp index 47ec4b1e8..3837656b5 100644 --- 
a/third_party/duckstation/gl/context_wgl.cpp +++ b/third_party/duckstation/gl/context_wgl.cpp @@ -19,6 +19,17 @@ static void* GetProcAddressCallback(const char* name) } namespace GL { +static bool ReloadWGL(HDC dc) +{ + if (!gladLoadWGL(dc)) + { + Log_ErrorPrint("Loading GLAD WGL functions failed"); + return false; + } + + return true; +} + ContextWGL::ContextWGL(const WindowInfo& wi) : Context(wi) {} ContextWGL::~ContextWGL() @@ -149,8 +160,8 @@ std::unique_ptr ContextWGL::CreateSharedContext(const WindowInfo& wi) } else { - Log_ErrorPrint("PBuffer not implemented"); - return nullptr; + if (!context->CreatePBuffer()) + return nullptr; } if (m_version.profile == Profile::NoProfile) @@ -305,6 +316,32 @@ bool ContextWGL::CreatePBuffer() static constexpr const int pb_attribs[] = {0, 0}; + HGLRC temp_rc = nullptr; + ScopedGuard temp_rc_guard([&temp_rc, hdc]() { + if (temp_rc) + { + wglMakeCurrent(hdc, nullptr); + wglDeleteContext(temp_rc); + } + }); + + if (!GLAD_WGL_ARB_pbuffer) + { + // we're probably running completely surfaceless... need a temporary context. 
+ temp_rc = wglCreateContext(hdc); + if (!temp_rc || !wglMakeCurrent(hdc, temp_rc)) + { + Log_ErrorPrint("Failed to create temporary context to load WGL for pbuffer."); + return false; + } + + if (!ReloadWGL(hdc) || !GLAD_WGL_ARB_pbuffer) + { + Log_ErrorPrint("Missing WGL_ARB_pbuffer"); + return false; + } + } + AssertMsg(m_pixel_format.has_value(), "Has pixel format for pbuffer"); HPBUFFERARB pbuffer = wglCreatePbufferARB(hdc, m_pixel_format.value(), 1, 1, pb_attribs); if (!pbuffer) @@ -326,6 +363,7 @@ bool ContextWGL::CreatePBuffer() m_dummy_dc = hdc; m_pbuffer = pbuffer; + temp_rc_guard.Run(); pbuffer_guard.Cancel(); hdc_guard.Cancel(); hwnd_guard.Cancel(); diff --git a/third_party/lockfree/LICENSE b/third_party/lockfree/LICENSE new file mode 100644 index 000000000..2cb6782f8 --- /dev/null +++ b/third_party/lockfree/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Djordje Nedic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/third_party/lockfree/lockfree/spsc/queue.hpp b/third_party/lockfree/lockfree/spsc/queue.hpp
new file mode 100755
index 000000000..97a8dc3f3
--- /dev/null
+++ b/third_party/lockfree/lockfree/spsc/queue.hpp
@@ -0,0 +1,110 @@
+/**************************************************************
+ * @file queue.hpp
+ * @brief A queue implementation written in standard c++11
+ * suitable for both low-end microcontrollers all the way
+ * to HPC machines. Lock-free for single consumer single
+ * producer scenarios.
+ **************************************************************/
+
+/**************************************************************
+ * Copyright (c) 2023 Djordje Nedic
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, subject to the
+ * following conditions:
+ *
+ * The above copyright notice and this permission notice shall
+ * be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of lockfree
+ *
+ * Author: Djordje Nedic
+ * Version: v2.0.9
+ **************************************************************/
+
+/************************** INCLUDE ***************************/
+#ifndef LOCKFREE_QUEUE_HPP
+#define LOCKFREE_QUEUE_HPP
+
+#include <atomic>
+#include <cstddef>
+#include <type_traits>
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#include <optional>
+#endif
+
+namespace lockfree {
+namespace spsc {
+/*************************** TYPES ****************************/
+
+template <typename T, size_t size> class Queue {
+    static_assert(std::is_trivial<T>::value, "The type T must be trivial");
+    static_assert(size > 2, "Buffer size must be bigger than 2");
+
+    /********************** PUBLIC METHODS ************************/
+  public:
+    Queue();
+
+    /**
+     * @brief Adds an element into the queue.
+     * Should only be called from the producer thread.
+     * @param[in] element
+     * @retval Operation success
+     */
+    bool Push(const T &element);
+
+    /**
+     * @brief Removes an element from the queue.
+     * Should only be called from the consumer thread.
+     * @param[out] element
+     * @retval Operation success
+     */
+    bool Pop(T &element);
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+    /**
+     * @brief Removes an element from the queue.
+     * Should only be called from the consumer thread.
+     * @retval Either the element or nothing
+     */
+    std::optional<T> PopOptional();
+#endif
+
+    /********************** PRIVATE MEMBERS ***********************/
+  private:
+    T _data[size]; /**< Data array */
+#if LOCKFREE_CACHE_COHERENT
+    alignas(LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _r; /**< Read index */
+    alignas(
+        LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _w; /**< Write index */
+#else
+    std::atomic_size_t _r; /**< Read index */
+    std::atomic_size_t _w; /**< Write index */
+#endif
+};
+
+} /* namespace spsc */
+} /* namespace lockfree */
+
+/************************** INCLUDE ***************************/
+
+/* Include the implementation */
+#include "queue_impl.hpp"
+
+#endif /* LOCKFREE_QUEUE_HPP */
diff --git a/third_party/lockfree/lockfree/spsc/queue_impl.hpp b/third_party/lockfree/lockfree/spsc/queue_impl.hpp
new file mode 100644
index 000000000..43654c88e
--- /dev/null
+++ b/third_party/lockfree/lockfree/spsc/queue_impl.hpp
@@ -0,0 +1,111 @@
+/**************************************************************
+ * @file queue_impl.hpp
+ * @brief A queue implementation written in standard c++11
+ * suitable for both low-end microcontrollers all the way
+ * to HPC machines. Lock-free for single consumer single
+ * producer scenarios.
+ **************************************************************/
+
+/**************************************************************
+ * Copyright (c) 2023 Djordje Nedic
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to
+ * whom the Software is furnished to do so, subject to the
+ * following conditions:
+ *
+ * The above copyright notice and this permission notice shall
+ * be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of lockfree
+ *
+ * Author: Djordje Nedic
+ * Version: v2.0.9
+ **************************************************************/
+
+namespace lockfree {
+namespace spsc {
+/********************** PUBLIC METHODS ************************/
+
+template <typename T, size_t size> Queue<T, size>::Queue() : _r(0U), _w(0U) {}
+
+template <typename T, size_t size> bool Queue<T, size>::Push(const T &element) {
+    /*
+       The full check needs to be performed using the next write index not to
+       miss the case when the read index wrapped and write index is at the end
+     */
+    const size_t w = _w.load(std::memory_order_relaxed);
+    size_t w_next = w + 1;
+    if (w_next == size) {
+        w_next = 0U;
+    }
+
+    /* Full check */
+    const size_t r = _r.load(std::memory_order_acquire);
+    if (w_next == r) {
+        return false;
+    }
+
+    /* Place the element */
+    _data[w] = element;
+
+    /* Store the next write index */
+    _w.store(w_next, std::memory_order_release);
+    return true;
+}
+
+template <typename T, size_t size> bool Queue<T, size>::Pop(T &element) {
+    /* Preload indexes with adequate memory ordering */
+    size_t r = _r.load(std::memory_order_relaxed);
+    const size_t w = _w.load(std::memory_order_acquire);
+
+    /* Empty check */
+    if (r == w) {
+        return false;
+    }
+
+    /* Remove the element */
+    element = _data[r];
+
+    /* Increment the read index */
+    r++;
+    if (r == size) {
+        r = 0U;
+    }
+
+    /* Store the read index */
+    _r.store(r, std::memory_order_release);
+    return true;
+}
+
+/********************* std::optional API **********************/
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+template <typename T, size_t size>
+std::optional<T> Queue<T, size>::PopOptional() {
+    T element;
+    bool result = Pop(element);
+
+    if (result) {
+        return element;
+    } else {
+        return {};
+    }
+}
+#endif
+
+} /* namespace spsc */
+} /* namespace lockfree */