Skip to content

Commit

Permalink
Get shader color write masking going on all backends.
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Nov 8, 2020
1 parent 7632c12 commit 6310af2
Show file tree
Hide file tree
Showing 22 changed files with 323 additions and 163 deletions.
26 changes: 26 additions & 0 deletions Common/GPU/OpenGL/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,32 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
CHECK_GL_ERROR_IF_DEBUG();
break;
}
case GLRRenderCommand::UNIFORM4UI:
{
	// Uploads 1 to 4 unsigned integer components to a uniform of the current program.
	_dbg_assert_(curProgram);

	// Resolve the uniform location: a name lookup, when a name is supplied,
	// wins over the location pointer. -1 means "no location available".
	int loc = -1;
	if (c.uniform4.name) {
		loc = curProgram->GetUniformLoc(c.uniform4.name);
	} else if (c.uniform4.loc) {
		loc = *c.uniform4.loc;
	}

	if (loc >= 0) {
		// The payload is stored in uniform4.v and reinterpreted here as unsigned integers.
		const GLuint *values = (const GLuint *)c.uniform4.v;
		// Component counts outside 1..4 are silently ignored.
		if (c.uniform4.count == 1) {
			glUniform1uiv(loc, 1, values);
		} else if (c.uniform4.count == 2) {
			glUniform2uiv(loc, 1, values);
		} else if (c.uniform4.count == 3) {
			glUniform3uiv(loc, 1, values);
		} else if (c.uniform4.count == 4) {
			glUniform4uiv(loc, 1, values);
		}
	}
	CHECK_GL_ERROR_IF_DEBUG();
	break;
}
case GLRRenderCommand::UNIFORM4I:
{
_dbg_assert_(curProgram);
Expand Down
1 change: 1 addition & 0 deletions Common/GPU/OpenGL/GLQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ enum class GLRRenderCommand : uint8_t {
BLENDCOLOR,
LOGICOP,
UNIFORM4I,
UNIFORM4UI,
UNIFORM4F,
UNIFORMMATRIX,
TEXTURESAMPLER,
Expand Down
24 changes: 24 additions & 0 deletions Common/GPU/OpenGL/GLRenderManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,30 @@ class GLRenderManager {
curRenderStep_->commands.push_back(data);
}

// Queues a UNIFORM4UI command on the current render step, setting an
// unsigned-integer uniform with `count` components.
//
// loc:   pointer to the uniform location; stored as-is in the command and
//        dereferenced later by the queue runner, not here.
// count: number of components to upload. Must be in 1..4.
// udata: `count` values, copied into the command.
void SetUniformUI(const GLint *loc, int count, const uint32_t *udata) {
	_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
	_dbg_assert_(curProgram_);
#endif
	// The runner's UNIFORM4UI handler only acts on counts 1..4, and the payload
	// (uniform4.v) is presumably a 4-element array - TODO confirm. Any other
	// count would make the memcpy below write past it, so flag it in debug
	// builds and drop the command instead of corrupting memory.
	_dbg_assert_(count >= 1 && count <= 4);
	if (count < 1 || count > 4) {
		return;
	}
	GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
	data.uniform4.loc = loc;
	data.uniform4.count = count;
	memcpy(data.uniform4.v, udata, sizeof(uint32_t) * count);
	curRenderStep_->commands.push_back(data);
}

void SetUniformUI1(const GLint *loc, uint32_t udata) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
data.uniform4.loc = loc;
data.uniform4.count = 1;
memcpy(data.uniform4.v, &udata, sizeof(udata));
curRenderStep_->commands.push_back(data);
}

void SetUniformF(const GLint *loc, int count, const float *udata) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
#ifdef _DEBUG
Expand Down
1 change: 1 addition & 0 deletions Common/GPU/ShaderWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const char *hlsl_preamble_fs =
"#define vec3 float3\n"
"#define vec4 float4\n"
"#define uvec3 uint3\n"
"#define uvec4 uint4\n"
"#define ivec3 int3\n"
"#define ivec4 int4\n"
"#define mat4 float4x4\n"
Expand Down
3 changes: 2 additions & 1 deletion Common/GPU/Vulkan/thin3d_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1543,7 +1543,8 @@ void VKContext::BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPass
void VKContext::BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) {
VKFramebuffer *fb = (VKFramebuffer *)fbo;

// TODO: There are cases where this is okay, actually.
// TODO: There are cases where this is okay, actually. But requires layout transitions and stuff -
// we're not ready for this.
_assert_(fb != curFramebuffer_);

int aspect = 0;
Expand Down
94 changes: 77 additions & 17 deletions GPU/Common/FragmentShaderGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
bool doFlatShading = id.Bit(FS_BIT_FLATSHADE);
bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL);
bool bgraTexture = id.Bit(FS_BIT_BGRA_TEXTURE);
bool colorWriteMask = id.Bit(FS_BIT_COLOR_WRITEMASK);

if (colorWriteMask && !compat.bitwiseOps) {
*errorString = "Color Write Mask requires bitwise ops";
return false;
}

GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
Expand All @@ -104,7 +110,13 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
bool earlyFragmentTests = ((!enableAlphaTest && !enableColorTest) || testForceToZero) && !gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
bool useAdrenoBugWorkaround = id.Bit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL);

bool readFramebufferTex = replaceBlend == REPLACE_BLEND_COPY_FBO && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);
bool readFramebuffer = replaceBlend == REPLACE_BLEND_COPY_FBO || colorWriteMask;
bool readFramebufferTex = readFramebuffer && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH);

if (readFramebuffer && compat.shaderLanguage == HLSL_D3D9) {
*errorString = "Framebuffer read not yet supported in HLSL D3D9";
return false;
}

if (compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {
if (earlyFragmentTests) {
Expand Down Expand Up @@ -188,11 +200,9 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
} else {
WRITE(p, "SamplerState samp : register(s0);\n");
WRITE(p, "Texture2D<vec4> tex : register(t0);\n");
if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
if (replaceBlend == REPLACE_BLEND_COPY_FBO) {
// No sampler required, we Load
WRITE(p, "Texture2D<vec4> fboTex : register(t1);\n");
}
if (readFramebufferTex) {
// No sampler required, we Load
WRITE(p, "Texture2D<vec4> fboTex : register(t1);\n");
}
WRITE(p, "cbuffer base : register(b0) {\n%s};\n", ub_baseStr);
}
Expand All @@ -207,7 +217,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
if (enableColorTest) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return uvec3(floor(x * 255.0f + 0.5f)); }\n");
WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return (floor(x * 255.0f + 0.5f)); }\n");
} else {
WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
}
Expand All @@ -225,7 +235,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (enableFog) {
WRITE(p, " float v_fogdepth: TEXCOORD1;\n");
}
if (compat.shaderLanguage == HLSL_D3D11 && ((replaceBlend == REPLACE_BLEND_COPY_FBO) || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT))) {
if (compat.shaderLanguage == HLSL_D3D11 && readFramebuffer) {
WRITE(p, " vec4 pixelPos : SV_POSITION;\n");
}
WRITE(p, "};\n");
Expand Down Expand Up @@ -286,14 +296,15 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (doTexture)
WRITE(p, "uniform sampler2D tex;\n");

if (readFramebufferTex) {
if (!compat.texelFetch) {
WRITE(p, "uniform vec2 u_fbotexSize;\n");
}
WRITE(p, "uniform sampler2D fbotex;\n");
}

if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
*uniformMask |= DIRTY_SHADERBLEND;
if (readFramebufferTex) {
if (!compat.texelFetch) {
WRITE(p, "uniform vec2 u_fbotexSize;\n");
}
WRITE(p, "uniform sampler2D fbotex;\n");
}
if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
WRITE(p, "uniform vec3 u_blendFixA;\n");
}
Expand Down Expand Up @@ -329,6 +340,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
*uniformMask |= DIRTY_DEPAL;
}

if (colorWriteMask) {
WRITE(p, "uniform uint u_colorWriteMask;\n");
*uniformMask |= DIRTY_COLORWRITEMASK;
}

if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
*uniformMask |= DIRTY_STENCILREPLACEVALUE;
WRITE(p, "uniform float u_stencilReplaceValue;\n");
Expand Down Expand Up @@ -387,6 +403,20 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu

}

// Provide implementations of packUnorm4x8 and unpackUnorm4x8 if not available.
// They are only used by the color write mask code, so only emit them when masking
// is active. The explicit parentheses matter: without them, && binds tighter than ||
// and the helpers would be emitted into every GLSL_3xx (< 400) shader regardless
// of colorWriteMask.
if (colorWriteMask && (compat.shaderLanguage == HLSL_D3D11 || (compat.shaderLanguage == GLSL_3xx && compat.glslVersionNumber < 400))) {
	WRITE(p, "uint packUnorm4x8(vec4 v) {\n");
	WRITE(p, " v = clamp(v, 0.0, 1.0);\n");
	// Round to nearest like the built-in does (round(clamp(c, 0, 1) * 255.0)).
	// Plain truncation can turn a value read back as k/255 into k - 1 when the
	// multiply lands just below k, which would corrupt the masked-out bits.
	WRITE(p, " uvec4 u = uvec4(255.0 * v + 0.5);\n");
	WRITE(p, " return u.x | (u.y << 8) | (u.z << 16) | (u.w << 24);\n");
	WRITE(p, "}\n");

	WRITE(p, "vec4 unpackUnorm4x8(uint x) {\n");
	WRITE(p, " uvec4 u = uvec4(x & 0xFFU, (x >> 8) & 0xFFU, (x >> 16) & 0xFFU, (x >> 24) & 0xFFU);\n");
	WRITE(p, " return vec4(u) * (1.0 / 255.0);\n");
	WRITE(p, "}\n");
}

// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
Expand Down Expand Up @@ -416,6 +446,21 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
if (isModeClear) {
// Clear mode does not allow any fancy shading.
WRITE(p, " vec4 v = v_color0;\n");

// Masking with clear mode is ok, I think?

This comment has been minimized.

Copy link
@unknownbrackets

unknownbrackets Dec 3, 2020

Collaborator

Correct, it's applied as per tests. Not logic ops, though.

-[Unknown]

if (readFramebuffer) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, " vec4 destColor = fboTex.Load(int3((int)In.pixelPos.x, (int)In.pixelPos.y, 0));\n");
} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (!compat.texelFetch) {
WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else {
WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
}
}
} else {
const char *secondary = "";
// Secondary color for specular on top of texture
Expand Down Expand Up @@ -799,19 +844,22 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
WRITE(p, " v.rgb = v.rgb * %s;\n", srcFactor);
}

if (replaceBlend == REPLACE_BLEND_COPY_FBO && compat.shaderLanguage != HLSL_D3D9) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
// Two things read from the old framebuffer - shader replacement blending and bit-level masking.
if (readFramebuffer) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, " vec4 destColor = fboTex.Load(int3((int)In.pixelPos.x, (int)In.pixelPos.y, 0));\n");
} else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) {
// If we have NV_shader_framebuffer_fetch / EXT_shader_framebuffer_fetch, we skip the blit.
// We can just read the prev value more directly.
WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData);
} else if (!compat.texelFetch) {
WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
} else {
WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
}
}

if (replaceBlend == REPLACE_BLEND_COPY_FBO) {
const char *srcFactor = nullptr;
const char *dstFactor = nullptr;

Expand Down Expand Up @@ -927,6 +975,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
return false;
}

// TODO: This could support more ops using the shader blending mechanism.
LogicOpReplaceType replaceLogicOpType = (LogicOpReplaceType)id.Bits(FS_BIT_REPLACE_LOGIC_OP_TYPE, 2);
switch (replaceLogicOpType) {
case LOGICOPTYPE_ONE:
Expand All @@ -943,6 +992,17 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
return false;
}

// Final color computed - apply color write mask.
// TODO: Maybe optimize to only do math on the affected channels?
// Or .. meh.
if (colorWriteMask) {
WRITE(p, " highp uint v32 = packUnorm4x8(v);\n");
WRITE(p, " highp uint d32 = packUnorm4x8(destColor);\n");
// Note that the mask has been flipped to the PC way - 1 means write.
WRITE(p, " v32 = (v32 & u_colorWriteMask) | (d32 & ~u_colorWriteMask);\n");
WRITE(p, " v = unpackUnorm4x8(v32);\n");
}

if (gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
const double scale = DepthSliceFactor() * 65535.0;

Expand Down
62 changes: 61 additions & 1 deletion GPU/Common/GPUStateUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -956,8 +956,68 @@ void ApplyStencilReplaceAndLogicOpIgnoreBlend(ReplaceAlphaType replaceAlphaWithS
}
}

// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
// Returns true when at least one of the four 8-bit channel masks is neither
// all-zeros nor all-ones, i.e. the mask cannot be expressed as plain per-channel
// on/off write flags. Always returns false when framebuffer reads are not allowed.
bool IsColorWriteMaskComplex(bool allowFramebufferRead) {
	// Temporarily restricted to Outrun, by (uglily) reusing the ReinterpretFramebuffers compat flag.
	const bool canUseShaderMask = allowFramebufferRead && PSP_CoreParameter().compat.flags().ReinterpretFramebuffers;
	if (!canUseShaderMask) {
		// No choice here - we'll make do, but the result won't always be right.
		return false;
	}

	// Pack the 24-bit color mask and the 8-bit alpha mask into one 32-bit value.
	const uint32_t packedMask = (gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24);

	// Inspect each channel's byte; any partially-set byte makes the mask complex.
	for (int shift = 0; shift < 32; shift += 8) {
		const uint32_t channelBits = (packedMask >> shift) & 0xFF;
		if (channelBits != 0x0 && channelBits != 0xFF) {
			return true;
		}
	}
	return false;
}

// Fills in maskState from the current PSP color/alpha write mask registers.
// When every channel's mask byte is all-zeros or all-ones, the mask maps directly
// onto the per-channel R G B A write flags offered by modern hardware - this is
// 99.9% of the time. When a channel is only partially masked, we either request a
// framebuffer read (a technique similar to shader blending, reading from the
// framebuffer or a copy of it) if that is allowed, or fall back on a heuristic.
void ConvertMaskState(GenericMaskState &maskState, bool allowFramebufferRead) {
	// On the PSP a set bit means "don't draw"; on PC a set bit means "draw". Flip it.
	const uint32_t pspMask = (gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24);
	uint32_t writeBits = ~pspMask;

	maskState.applyFramebufferRead = false;
	for (int channel = 0; channel < 4; channel++, writeBits >>= 8) {
		const int bits = writeBits & 0xFF;
		if (bits == 0x0) {
			maskState.rgba[channel] = false;
		} else if (bits == 0xFF) {
			maskState.rgba[channel] = true;
		} else if (allowFramebufferRead) {
			// Partial mask: enable the channel and let the shader do the bit-level masking.
			maskState.applyFramebufferRead = true;
			maskState.rgba[channel] = true;
		} else {
			// The old heuristic: write the channel if its top bit is writable.
			maskState.rgba[channel] = bits >= 128;
		}
	}

	// Don't write alpha if stencil output is disabled, or if the stencil type is
	// KEEP (in which case the stencil/alpha channel shouldn't be written to).
	if (IsStencilTestOutputDisabled() || ReplaceAlphaWithStencilType() == STENCIL_VALUE_KEEP) {
		maskState.rgba[3] = false;
	}
}

// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
void ConvertBlendState(GenericBlendState &blendState, bool allowFramebufferRead) {
// Blending is a bit complex to emulate. This is due to several reasons:
//
Expand Down
Loading

0 comments on commit 6310af2

Please sign in to comment.