Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix car reflections in Outrun by implementing per-bit color masking #13640

Merged
merged 1 commit into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Common/GPU/OpenGL/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,32 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
CHECK_GL_ERROR_IF_DEBUG();
break;
}
case GLRRenderCommand::UNIFORM4UI:
{
_dbg_assert_(curProgram);
int loc = c.uniform4.loc ? *c.uniform4.loc : -1;
if (c.uniform4.name) {
loc = curProgram->GetUniformLoc(c.uniform4.name);
}
if (loc >= 0) {
switch (c.uniform4.count) {
case 1:
glUniform1uiv(loc, 1, (GLuint *)&c.uniform4.v[0]);
break;
case 2:
glUniform2uiv(loc, 1, (GLuint *)c.uniform4.v);
break;
case 3:
glUniform3uiv(loc, 1, (GLuint *)c.uniform4.v);
break;
case 4:
glUniform4uiv(loc, 1, (GLuint *)c.uniform4.v);
break;
}
}
CHECK_GL_ERROR_IF_DEBUG();
break;
}
case GLRRenderCommand::UNIFORM4I:
{
_dbg_assert_(curProgram);
Expand Down
1 change: 1 addition & 0 deletions Common/GPU/OpenGL/GLQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ enum class GLRRenderCommand : uint8_t {
BLENDCOLOR,
LOGICOP,
UNIFORM4I,
UNIFORM4UI,
UNIFORM4F,
UNIFORMMATRIX,
TEXTURESAMPLER,
Expand Down
24 changes: 24 additions & 0 deletions Common/GPU/OpenGL/GLRenderManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,30 @@ class GLRenderManager {
curRenderStep_->commands.push_back(data);
}

void SetUniformUI(const GLint *loc, int count, const uint32_t *udata) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
data.uniform4.loc = loc;
data.uniform4.count = count;
memcpy(data.uniform4.v, udata, sizeof(uint32_t) * count);
curRenderStep_->commands.push_back(data);
}

void SetUniformUI1(const GLint *loc, uint32_t udata) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
data.uniform4.loc = loc;
data.uniform4.count = 1;
memcpy(data.uniform4.v, &udata, sizeof(udata));
curRenderStep_->commands.push_back(data);
}

void SetUniformF(const GLint *loc, int count, const float *udata) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
Expand Down
1 change: 1 addition & 0 deletions Common/GPU/ShaderWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const char *hlsl_preamble_fs =
"#define vec3 float3\n"
"#define vec4 float4\n"
"#define uvec3 uint3\n"
"#define uvec4 uint4\n"
"#define ivec3 int3\n"
"#define ivec4 int4\n"
"#define mat4 float4x4\n"
Expand Down
3 changes: 2 additions & 1 deletion Common/GPU/Vulkan/thin3d_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1543,7 +1543,8 @@ void VKContext::BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPass
void VKContext::BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) {
VKFramebuffer *fb = (VKFramebuffer *)fbo;

// TODO: There are cases where this is okay, actually.
// TODO: There are cases where this is okay, actually. But requires layout transitions and stuff -
// we're not ready for this.
_assert_(fb != curFramebuffer_);

int aspect = 0;
Expand Down
94 changes: 77 additions & 17 deletions GPU/Common/FragmentShaderGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
bool doFlatShading = id.Bit(FS_BIT_FLATSHADE);
bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL);
bool bgraTexture = id.Bit(FS_BIT_BGRA_TEXTURE);
bool colorWriteMask = id.Bit(FS_BIT_COLOR_WRITEMASK);

if (colorWriteMask && !compat.bitwiseOps) {
*errorString = "Color Write Mask requires bitwise ops";
return false;
}

GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
Expand All @@ -104,7 +110,13 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
bool earlyFragmentTests = ((!enableAlphaTest && !enableColorTest) || testForceToZero) && !gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
bool useAdrenoBugWorkaround = id.Bit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL);

bool readFramebufferTex = replaceBlend == REPLACE_BLEND_COPY_FBO && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
bool readFramebuffer = replaceBlend == REPLACE_BLEND_COPY_FBO || colorWriteMask;
bool readFramebufferTex = readFramebuffer && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);

if (readFramebuffer && compat.shaderLanguage == HLSL_D3D9) {
*errorString = "Framebuffer read not yet supported in HLSL D3D9";
return false;
}

if (compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {
if (earlyFragmentTests) {
Expand Down Expand Up @@ -188,11 +200,9 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
} else {
WRITE(p, "SamplerState samp : register(s0);\n");
WRITE(p, "Texture2D<vec4> tex : register(t0);\n");
if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
if (replaceBlend == REPLACE_BLEND_COPY_FBO) {
// No sampler required, we Load
WRITE(p, "Texture2D<vec4> fboTex : register(t1);\n");
}
if (readFramebufferTex) {
// No sampler required, we Load
WRITE(p, "Texture2D<vec4> fboTex : register(t1);\n");
}
WRITE(p, "cbuffer base : register(b0) {\n%s};\n", ub_baseStr);
}
Expand All @@ -207,7 +217,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
if (enableColorTest) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return uvec3(floor(x * 255.0f + 0.5f)); }\n");
WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return (floor(x * 255.0f + 0.5f)); }\n");
} else {
WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
}
Expand All @@ -225,7 +235,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (enableFog) {
WRITE(p, " float v_fogdepth: TEXCOORD1;\n");
}
if (compat.shaderLanguage == HLSL_D3D11 && ((replaceBlend == REPLACE_BLEND_COPY_FBO) || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT))) {
if (compat.shaderLanguage == HLSL_D3D11 && readFramebuffer) {
WRITE(p, " vec4 pixelPos : SV_POSITION;\n");
}
WRITE(p, "};\n");
Expand Down Expand Up @@ -286,14 +296,15 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (doTexture)
WRITE(p, "uniform sampler2D tex;\n");

if (readFramebufferTex) {
if (!compat.texelFetch) {
WRITE(p, "uniform vec2 u_fbotexSize;\n");
}
WRITE(p, "uniform sampler2D fbotex;\n");
}

if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
*uniformMask |= DIRTY_SHADERBLEND;
if (readFramebufferTex) {
if (!compat.texelFetch) {
WRITE(p, "uniform vec2 u_fbotexSize;\n");
}
WRITE(p, "uniform sampler2D fbotex;\n");
}
if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
WRITE(p, "uniform vec3 u_blendFixA;\n");
}
Expand Down Expand Up @@ -329,6 +340,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
*uniformMask |= DIRTY_DEPAL;
}

if (colorWriteMask) {
WRITE(p, "uniform uint u_colorWriteMask;\n");
*uniformMask |= DIRTY_COLORWRITEMASK;
}

if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
*uniformMask |= DIRTY_STENCILREPLACEVALUE;
WRITE(p, "uniform float u_stencilReplaceValue;\n");
Expand Down Expand Up @@ -387,6 +403,20 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu

}

// Provide implementations of packUnorm4x8 and unpackUnorm4x8 if not available.
if (colorWriteMask && compat.shaderLanguage == HLSL_D3D11 || (compat.shaderLanguage == GLSL_3xx && compat.glslVersionNumber < 400)) {
WRITE(p, "uint packUnorm4x8(vec4 v) {\n");
WRITE(p, " v = clamp(v, 0.0, 1.0);\n");
WRITE(p, " uvec4 u = uvec4(255.0 * v);\n");
WRITE(p, " return u.x | (u.y << 8) | (u.z << 16) | (u.w << 24);\n");
WRITE(p, "}\n");

WRITE(p, "vec4 unpackUnorm4x8(uint x) {\n");
WRITE(p, " uvec4 u = uvec4(x & 0xFFU, (x >> 8) & 0xFFU, (x >> 16) & 0xFFU, (x >> 24) & 0xFFU);\n");
WRITE(p, " return vec4(u) * (1.0 / 255.0);\n");
WRITE(p, "}\n");
}

// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
Expand Down Expand Up @@ -416,6 +446,21 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (isModeClear) {
// Clear mode does not allow any fancy shading.
WRITE(p, " vec4 v = v_color0;\n");

// Masking with clear mode is ok, I think?
if (readFramebuffer) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, " vec4 destColor = fboTex.Load(int3((int)In.pixelPos.x, (int)In.pixelPos.y, 0));\n");
} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (!compat.texelFetch) {
WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else {
WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
}
}
} else {
const char *secondary = "";
// Secondary color for specular on top of texture
Expand Down Expand Up @@ -799,19 +844,22 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
WRITE(p, " v.rgb = v.rgb * %s;\n", srcFactor);
}

if (replaceBlend == REPLACE_BLEND_COPY_FBO && compat.shaderLanguage != HLSL_D3D9) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
// Two things read from the old framebuffer - shader replacement blending and bit-level masking.
if (readFramebuffer) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, " vec4 destColor = fboTex.Load(int3((int)In.pixelPos.x, (int)In.pixelPos.y, 0));\n");
} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (!compat.texelFetch) {
WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else {
WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
}
}

if (replaceBlend == REPLACE_BLEND_COPY_FBO) {
const char *srcFactor = nullptr;
const char *dstFactor = nullptr;

Expand Down Expand Up @@ -927,6 +975,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
return false;
}

// TODO: This could support more ops using the shader blending mechanism.
LogicOpReplaceType replaceLogicOpType = (LogicOpReplaceType)id.Bits(FS_BIT_REPLACE_LOGIC_OP_TYPE, 2);
switch (replaceLogicOpType) {
case LOGICOPTYPE_ONE:
Expand All @@ -943,6 +992,17 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
return false;
}

// Final color computed - apply color write mask.
// TODO: Maybe optimize to only do math on the affected channels?
// Or .. meh.
if (colorWriteMask) {
WRITE(p, " highp uint v32 = packUnorm4x8(v);\n");
WRITE(p, " highp uint d32 = packUnorm4x8(destColor);\n");
// Note that the mask has been flipped to the PC way - 1 means write.
WRITE(p, " v32 = (v32 & u_colorWriteMask) | (d32 & ~u_colorWriteMask);\n");
WRITE(p, " v = unpackUnorm4x8(v32);\n");
}

if (gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
const double scale = DepthSliceFactor() * 65535.0;

Expand Down
62 changes: 61 additions & 1 deletion GPU/Common/GPUStateUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -956,8 +956,68 @@ void ApplyStencilReplaceAndLogicOpIgnoreBlend(ReplaceAlphaType replaceAlphaWithS
}
}

// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
bool IsColorWriteMaskComplex(bool allowFramebufferRead) {
// Restrict to Outrun temporarily (by uglily reusing the ReinterpretFramebuffers flag)
if (!allowFramebufferRead || !PSP_CoreParameter().compat.flags().ReinterpretFramebuffers) {
// Don't have a choice - we'll make do but it won't always be right.
return false;
}

uint32_t colorMask = (gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24);

for (int i = 0; i < 4; i++) {
switch (colorMask & 0xFF) {
case 0x0:
case 0xFF:
break;
default:
return true;
}
colorMask >>= 8;
}
return false;
}

// If we can we emulate the colorMask by simply toggling the full R G B A masks offered
// by modern hardware, we do that. This is 99.9% of the time.
// When that's not enough, we fall back on a technique similar to shader blending,
// we read from the framebuffer (or a copy of it).
void ConvertMaskState(GenericMaskState &maskState, bool allowFramebufferRead) {
// Invert to convert masks from the PSP's format where 1 is don't draw to PC where 1 is draw.
uint32_t colorMask = ~((gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24));

maskState.applyFramebufferRead = false;
for (int i = 0; i < 4; i++) {
int channelMask = colorMask & 0xFF;
switch (channelMask) {
case 0x0:
maskState.rgba[i] = false;
break;
case 0xFF:
maskState.rgba[i] = true;
break;
default:
if (allowFramebufferRead) {
maskState.applyFramebufferRead = true;
maskState.rgba[i] = true;
} else {
// Use the old heuristic.
maskState.rgba[i] = channelMask >= 128;
}
}
colorMask >>= 8;
}

// Let's not write to alpha if stencil isn't enabled.
if (IsStencilTestOutputDisabled()) {
maskState.rgba[3] = false;
} else if (ReplaceAlphaWithStencilType() == STENCIL_VALUE_KEEP) {
// If the stencil type is set to KEEP, we shouldn't write to the stencil/alpha channel.
maskState.rgba[3] = false;
}
}

// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
void ConvertBlendState(GenericBlendState &blendState, bool allowFramebufferRead) {
// Blending is a bit complex to emulate. This is due to several reasons:
//
Expand Down
Loading