Skip to content

Commit

Permalink
Implement shader depal for GL as well, but disabled by default.
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Apr 13, 2018
1 parent 0479255 commit fb7a63b
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 7 deletions.
94 changes: 93 additions & 1 deletion GPU/GLES/FragmentShaderGeneratorGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform
bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ);
bool doTextureAlpha = id.Bit(FS_BIT_TEXALPHA);
bool doFlatShading = id.Bit(FS_BIT_FLATSHADE);
bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL);

GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
Expand Down Expand Up @@ -217,6 +218,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform
}
}

if (shaderDepal) {
WRITE(p, "uniform sampler2D pal;\n");
WRITE(p, "uniform int u_depal;\n");
*uniformMask |= DIRTY_DEPAL;
}

StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4);
if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
*uniformMask |= DIRTY_STENCILREPLACEVALUE;
Expand Down Expand Up @@ -336,10 +343,95 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform

if (doTextureProjection) {
WRITE(p, " vec4 t = %sProj(tex, %s);\n", texture, texcoord);
if (shaderDepal) {
WRITE(p, " vec4 t1 = %sProjOffset(tex, %s, ivec2(1, 0));\n", texture, texcoord);
WRITE(p, " vec4 t2 = %sProjOffset(tex, %s, ivec2(0, 1));\n", texture, texcoord);
WRITE(p, " vec4 t3 = %sProjOffset(tex, %s, ivec2(1, 1));\n", texture, texcoord);
}
} else {
WRITE(p, " vec4 t = %s(tex, %s.xy);\n", texture, texcoord);
if (shaderDepal) {
WRITE(p, " vec4 t1 = %sOffset(tex, %s.xy, ivec2(1, 0));\n", texture, texcoord);
WRITE(p, " vec4 t2 = %sOffset(tex, %s.xy, ivec2(0, 1));\n", texture, texcoord);
WRITE(p, " vec4 t3 = %sOffset(tex, %s.xy, ivec2(1, 1));\n", texture, texcoord);
}
}
WRITE(p, " vec4 p = v_color0;\n");

if (shaderDepal) {
WRITE(p, " int depalMask = (u_depal & 0xFF);\n");
WRITE(p, " int depalShift = ((u_depal >> 8) & 0xFF);\n");
WRITE(p, " int depalOffset = (((u_depal >> 16) & 0xFF) << 4);\n");
WRITE(p, " int depalFmt = ((u_depal >> 24) & 0x3);\n");
WRITE(p, " bool bilinear = (u_depal >> 31) != 0;\n");
WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord);
WRITE(p, " ivec4 col; int index0; int index1; int index2; int index3;\n");
WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue.
WRITE(p, " case 0:\n"); // 565
WRITE(p, " col = ivec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
WRITE(p, " index0 = (col.b << 11) | (col.g << 5) | (col.r);\n");
WRITE(p, " if (bilinear) {\n");
WRITE(p, " col = ivec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
WRITE(p, " index1 = (col.b << 11) | (col.g << 5) | (col.r);\n");
WRITE(p, " col = ivec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
WRITE(p, " index2 = (col.b << 11) | (col.g << 5) | (col.r);\n");
WRITE(p, " col = ivec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
WRITE(p, " index3 = (col.b << 11) | (col.g << 5) | (col.r);\n");
WRITE(p, " }\n");
WRITE(p, " break;\n");
WRITE(p, " case 1:\n"); // 5551
WRITE(p, " col = ivec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
WRITE(p, " index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
WRITE(p, " if (bilinear) {\n");
WRITE(p, " col = ivec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
WRITE(p, " index1 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
WRITE(p, " col = ivec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
WRITE(p, " index2 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
WRITE(p, " col = ivec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
WRITE(p, " index3 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n");
WRITE(p, " }\n");
WRITE(p, " break;\n");
WRITE(p, " case 2:\n"); // 4444
WRITE(p, " col = ivec4(t.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n");
WRITE(p, " index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
WRITE(p, " if (bilinear) {\n");
WRITE(p, " col = ivec4(t1.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n");
WRITE(p, " index1 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
WRITE(p, " col = ivec4(t2.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n");
WRITE(p, " index2 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
WRITE(p, " col = ivec4(t3.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n");
WRITE(p, " index3 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n");
WRITE(p, " }\n");
WRITE(p, " break;\n");
WRITE(p, " case 3:\n"); // 8888
WRITE(p, " col = ivec4(t.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n");
WRITE(p, " index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
WRITE(p, " if (bilinear) {\n");
WRITE(p, " col = ivec4(t1.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n");
WRITE(p, " index1 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
WRITE(p, " col = ivec4(t2.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n");
WRITE(p, " index2 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
WRITE(p, " col = ivec4(t3.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n");
WRITE(p, " index3 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n");
WRITE(p, " }\n");
WRITE(p, " break;\n");
WRITE(p, " };\n");
WRITE(p, " index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n");
WRITE(p, " t = texelFetch(pal, ivec2(index0, 0), 0);\n");
WRITE(p, " if (bilinear) {\n");
WRITE(p, " index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n");
WRITE(p, " index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n");
WRITE(p, " index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n");
WRITE(p, " t1 = texelFetch(pal, ivec2(index1, 0), 0);\n");
WRITE(p, " t2 = texelFetch(pal, ivec2(index2, 0), 0);\n");
WRITE(p, " t3 = texelFetch(pal, ivec2(index3, 0), 0);\n");
WRITE(p, " t = mix(t, t1, fraction.x);\n");
WRITE(p, " t2 = mix(t2, t3, fraction.x);\n");
WRITE(p, " t = mix(t, t2, fraction.y);\n");
WRITE(p, " }\n");
}

if (texFunc != GE_TEXFUNC_REPLACE || !doTextureAlpha)
WRITE(p, " vec4 p = v_color0;\n");

if (doTextureAlpha) { // texfmt == RGBA
switch (texFunc) {
Expand Down
16 changes: 15 additions & 1 deletion GPU/GLES/ShaderManagerGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
queries.push_back({ &u_blendFixA, "u_blendFixA" });
queries.push_back({ &u_blendFixB, "u_blendFixB" });
queries.push_back({ &u_fbotexSize, "u_fbotexSize" });
queries.push_back({ &u_pal, "pal" });

// Transform
queries.push_back({ &u_view, "u_view" });
Expand Down Expand Up @@ -161,6 +162,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
queries.push_back({ &u_spline_count_v, "u_spline_count_v" });
queries.push_back({ &u_spline_type_u, "u_spline_type_u" });
queries.push_back({ &u_spline_type_v, "u_spline_type_v" });
queries.push_back({ &u_depal, "u_depal" });

attrMask = vs->GetAttrMask();
availableUniforms = vs->GetUniformMask() | fs->GetUniformMask();
Expand All @@ -169,6 +171,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
initialize.push_back({ &u_tex, 0, 0 });
initialize.push_back({ &u_fbotex, 0, 1 });
initialize.push_back({ &u_testtex, 0, 2 });
initialize.push_back({ &u_pal, 0, 3 }); // CLUT
initialize.push_back({ &u_tess_pos_tex, 0, 4 }); // Texture unit 4
initialize.push_back({ &u_tess_tex_tex, 0, 5 }); // Texture unit 5
initialize.push_back({ &u_tess_col_tex, 0, 6 }); // Texture unit 6
Expand Down Expand Up @@ -283,6 +286,17 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
if (!dirty)
return;

if (dirty & DIRTY_DEPAL) {
int indexMask = gstate.getClutIndexMask();
int indexShift = gstate.getClutIndexShift();
int indexOffset = gstate.getClutIndexStartPos() >> 4;
int format = gstate_c.depalFramebufferFormat;
uint32_t val = BytesToUint32(indexMask, indexShift, indexOffset, format);
// Poke in a bilinear filter flag in the top bit.
val |= gstate.isMagnifyFilteringEnabled() << 31;
render_->SetUniformI1(&u_depal, val);
}

// Update any dirty uniforms before we draw
if (dirty & DIRTY_PROJMATRIX) {
Matrix4x4 flippedMatrix;
Expand Down Expand Up @@ -810,7 +824,7 @@ std::string ShaderManagerGLES::DebugGetShaderString(std::string id, DebugShaderT
// as sometimes these features might have an effect on the ID bits.

#define CACHE_HEADER_MAGIC 0x83277592
#define CACHE_VERSION 11
#define CACHE_VERSION 12
struct CacheHeader {
uint32_t magic;
uint32_t version;
Expand Down
4 changes: 4 additions & 0 deletions GPU/GLES/ShaderManagerGLES.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ class LinkedShader {
int u_blendFixB;
int u_fbotexSize;

// Shader depal
int u_pal; // the texture
int u_depal; // the params

// Fragment processing inputs
int u_alphacolorref;
int u_alphacolormask;
Expand Down
35 changes: 33 additions & 2 deletions GPU/GLES/TextureCacheGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ void TextureCacheGLES::UpdateSamplingParams(TexCacheEntry &entry, bool force) {
render_->SetTextureSampler(0, sClamp ? GL_CLAMP_TO_EDGE : GL_REPEAT, tClamp ? GL_CLAMP_TO_EDGE : GL_REPEAT, MagFiltGL[magFilt], MinFiltGL[minFilt], aniso);
}

void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight) {
void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, bool forcePoint) {
int minFilt;
int magFilt;
bool sClamp;
Expand All @@ -171,6 +171,10 @@ void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferH
GetSamplingParams(minFilt, magFilt, sClamp, tClamp, lodBias, 0, 0, mode);

minFilt &= 1; // framebuffers can't mipmap.
if (forcePoint) {
minFilt &= ~1;
magFilt &= ~1;
}

// Often the framebuffer will not match the texture size. We'll wrap/clamp in the shader in that case.
// This happens whether we have OES_texture_npot or not.
Expand Down Expand Up @@ -324,6 +328,7 @@ void TextureCacheGLES::BindTexture(TexCacheEntry *entry) {
lastBoundTexture = entry->textureName;
}
UpdateSamplingParams(*entry, false);
gstate_c.useShaderDepal = false;
}

void TextureCacheGLES::Unbind() {
Expand Down Expand Up @@ -434,7 +439,33 @@ class TextureShaderApplier {
void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) {
DepalShader *depal = nullptr;
uint32_t clutMode = gstate.clutformat & 0xFFFFFF;

#if 0
bool useShaderDepal = gstate_c.Supports(GPU_SUPPORTS_GLSL_ES_300);
#else
bool useShaderDepal = false;
#endif

if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) {
if (useShaderDepal) {
const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
GLRTexture *clutTexture = depalShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBuf_);
render_->BindTexture(TEX_SLOT_CLUT, clutTexture);
framebufferManagerGL_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET);
SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, true);
InvalidateLastTexture();

// Since we started/ended render passes, might need these.
gstate_c.Dirty(DIRTY_DEPAL);
gstate_c.useShaderDepal = true;
gstate_c.depalFramebufferFormat = framebuffer->drawnFormat;
const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16);
const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor;
TexCacheEntry::TexStatus alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors, clutTotalColors, 1);
gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL);
return;
}

depal = depalShaderCache_->GetDepalettizeShader(clutMode, framebuffer->drawnFormat);
}
if (depal) {
Expand Down Expand Up @@ -472,7 +503,7 @@ void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFram
}

framebufferManagerGL_->RebindFramebuffer();
SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight);
SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, false);

InvalidateLastTexture();

Expand Down
2 changes: 1 addition & 1 deletion GPU/GLES/TextureCacheGLES.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class TextureCacheGLES : public TextureCacheCommon {
}
}

void SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight);
void SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, bool forcePoint);
bool GetCurrentTextureDebug(GPUDebugBuffer &buffer, int level) override;

void DeviceLost();
Expand Down
6 changes: 4 additions & 2 deletions ext/native/thin3d/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "GLRenderManager.h"
#include "DataFormatGL.h"
#include "base/logging.h"
#include "base/stringutil.h"
#include "gfx/gl_common.h"
#include "gfx/gl_debug_log.h"
#include "gfx_es2/gpu_features.h"
Expand Down Expand Up @@ -156,9 +157,10 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps) {

#ifdef _WIN32
OutputDebugStringUTF8(buf);
OutputDebugStringUTF8(vsCode);
if (vsCode)
OutputDebugStringUTF8(LineNumberString(vsCode).c_str());
if (fsCode)
OutputDebugStringUTF8(fsCode);
OutputDebugStringUTF8(LineNumberString(fsCode).c_str());
#endif
delete[] buf;
} else {
Expand Down

10 comments on commit fb7a63b

@weihuoya
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

switch and if really slow on mobile, maybe costed 20% performance.

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They aren't great on mobile GPUs, but is the 20% a number you measured? For this shader, all threads in each group will always be going the same way through the switches and ifs, so the penalty should be minimal if the GPU isn't ancient. Does this actually create a performance problem for you @weihuoya ?

@weihuoya
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

snapdragon 820, sonic rivals 2 run around 24fps, rm switch and if run full speed.

@weihuoya
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GLES depal shader set int with highp precision, then it works on my phone.

WRITE(p, "precision highp int;\n");

without this line, color is wrong.

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @weihuoya , that's with Vulkan and not in the (disabled) GL depal mode? I'm gonna have to go look at the spec to see when that should be needed, but looks right.

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noticed that for whatever reason we accidentally specify a too low GLSL version when compiling vulkan shaders. I'm gonna try to fix that, stay tuned..

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wait just noticed what commit you're commenting on here, you gotta be meaning GL not vulkan. That's good at least and explains it. I will add the line.

@unknownbrackets
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which backend does that happen on? Can you open an issue?

-[Unknown]

@weihuoya
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

last commit make shader error

	WRITE(p, "#define highp\n");

	if (shaderDepal) {
		WRITE(p, "precision highp int;\n");
	}

this line only for mobile, not PC

WRITE(p, "precision highp int;\n");

@hrydgard
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops. I'll fix it soon.

Please sign in to comment.