Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process rendered CLUTs on the GPU #8246

Closed
wants to merge 11 commits into from
61 changes: 56 additions & 5 deletions GPU/Common/DepalettizeShaderCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include "Common/Log.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/GLES/GLStateCache.h"
#include "GPU/Common/DepalettizeShaderCommon.h"

#include "GPU/Directx9/PixelShaderGeneratorDX9.h"
#include "GPU/GLES/GLStateCache.h"

#define WRITE p+=sprintf

Expand All @@ -48,6 +48,7 @@ void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat, ShaderLang
WRITE(p, "out vec4 fragColor0;\n");
WRITE(p, "uniform sampler2D tex;\n");
WRITE(p, "uniform sampler2D pal;\n");
WRITE(p, "uniform vec2 u_offset;\n");
}

// TODO: Add support for integer textures. Though it hardly matters.
Expand Down Expand Up @@ -110,7 +111,7 @@ void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat, ShaderLang
WRITE(p, ";\n");
}

WRITE(p, " fragColor0 = texture(pal, vec2((float(index) + 0.5) * (1.0 / %f), 0.0));\n", texturePixels);
WRITE(p, " fragColor0 = texture(pal, vec2((float(index) + 0.5) * %f * u_offset.x + u_offset.y, 0.0));\n", 1.0 / texturePixels);
WRITE(p, "}\n");
}

Expand Down Expand Up @@ -239,17 +240,19 @@ void GenerateDepalShaderFloat(char *buffer, GEBufferFormat pixelFormat, ShaderLa
WRITE(p, "varying vec2 v_texcoord0;\n");
WRITE(p, "uniform sampler2D tex;\n");
WRITE(p, "uniform sampler2D pal;\n");
WRITE(p, "uniform vec2 u_offset;\n");
WRITE(p, "void main() {\n");
WRITE(p, " vec4 index = texture2D(tex, v_texcoord0);\n");
WRITE(p, " float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
WRITE(p, " float coord = ((%s * %f)%s) * u_offset.x + u_offset.y;\n", lookupMethod, index_multiplier, offset);
WRITE(p, " gl_FragColor = texture2D(pal, vec2(coord, 0.0));\n");
WRITE(p, "}\n");
} else if (lang == HLSL_DX9) {
WRITE(p, "sampler tex: register(s0);\n");
WRITE(p, "sampler pal: register(s1);\n");
WRITE(p, "float2 u_offset : register(c%i);\n", CONST_PS_DEPAL_OFFSET);
WRITE(p, "float4 main(float2 v_texcoord0 : TEXCOORD0) : COLOR0 {\n");
WRITE(p, " float4 index = tex2D(tex, v_texcoord0);\n");
WRITE(p, " float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
WRITE(p, " float coord = ((%s * %f)%s) * u_offset.x + u_offset.y;\n", lookupMethod, index_multiplier, offset);
WRITE(p, " return tex2D(pal, float2(coord, 0.0)).bgra;\n");
WRITE(p, "}\n");
}
Expand All @@ -270,4 +273,52 @@ void GenerateDepalShader(char *buffer, GEBufferFormat pixelFormat, ShaderLanguag
}
}

// Writes the source of the fragment/pixel shader that resolves an indexed (R only)
// texture through a palette texture.
//
// buffer: receives the NUL-terminated shader source.  The caller must supply enough
//         room for the whole generated text; the writes are not bounds checked.
// lang:   shader dialect to emit.  GLSL_140, GLSL_300 and HLSL_DX9 are handled.
//
// In every dialect the red channel sampled from "tex" is multiplied by u_offset.x and
// offset by u_offset.y to form the horizontal lookup coordinate into "pal".
//
// For any other language value the buffer is left holding an empty string, so a caller
// never ends up compiling whatever happened to be in the buffer beforehand.
void GenerateIndexedShader(char *buffer, ShaderLanguage lang) {
	char *p = buffer;

	// Start from an empty string in case no branch below writes anything.
	*p = '\0';

	if (lang == GLSL_140) {
		if (gl_extensions.IsGLES) {
			WRITE(p, "#version 100\n");
			WRITE(p, "precision mediump float;\n");
		} else {
			WRITE(p, "#version 110\n");
		}
		WRITE(p, "varying vec2 v_texcoord0;\n");
		WRITE(p, "uniform sampler2D tex;\n");
		WRITE(p, "uniform sampler2D pal;\n");
		WRITE(p, "uniform vec2 u_offset;\n");
		WRITE(p, "void main() {\n");
		WRITE(p, "  vec4 index = texture2D(tex, v_texcoord0);\n");
		WRITE(p, "  float coord = index.r * u_offset.x + u_offset.y;\n");
		WRITE(p, "  gl_FragColor = texture2D(pal, vec2(coord, 0.0));\n");
		WRITE(p, "}\n");
	} else if (lang == GLSL_300) {
		if (gl_extensions.IsGLES) {
			WRITE(p, "#version 300 es\n");
			WRITE(p, "precision mediump float;\n");
		} else {
			WRITE(p, "#version 330\n");
		}
		WRITE(p, "in vec2 v_texcoord0;\n");
		WRITE(p, "out vec4 fragColor0;\n");
		WRITE(p, "uniform sampler2D tex;\n");
		WRITE(p, "uniform sampler2D pal;\n");
		WRITE(p, "uniform vec2 u_offset;\n");
		WRITE(p, "void main() {\n");
		WRITE(p, "  vec4 index = texture(tex, v_texcoord0);\n");
		WRITE(p, "  float coord = index.r * u_offset.x + u_offset.y;\n");
		WRITE(p, "  fragColor0 = texture(pal, vec2(coord, 0.0));\n");
		WRITE(p, "}\n");
	} else if (lang == HLSL_DX9) {
		WRITE(p, "sampler tex: register(s0);\n");
		WRITE(p, "sampler pal: register(s1);\n");
		// The offset constant lives in the pixel shader register reserved for depal.
		WRITE(p, "float2 u_offset : register(c%i);\n", CONST_PS_DEPAL_OFFSET);
		WRITE(p, "float4 main(float2 v_texcoord0 : TEXCOORD0) : COLOR0 {\n");
		WRITE(p, "  float4 index = tex2D(tex, v_texcoord0);\n");
		WRITE(p, "  float coord = index.r * u_offset.x + u_offset.y;\n");
		WRITE(p, "  return tex2D(pal, float2(coord, 0.0)).bgra;\n");
		WRITE(p, "}\n");
	}
}

#undef WRITE
1 change: 1 addition & 0 deletions GPU/Common/DepalettizeShaderCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ enum ShaderLanguage {
};

void GenerateDepalShader(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage language);
void GenerateIndexedShader(char *buffer, ShaderLanguage lang);
2 changes: 2 additions & 0 deletions GPU/Common/FramebufferCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ enum {
FB_USAGE_RENDERTARGET = 2,
FB_USAGE_TEXTURE = 4,
FB_USAGE_CLUT = 8,

FB_USAGE_KEEP = FB_USAGE_CLUT,
};

enum {
Expand Down
128 changes: 121 additions & 7 deletions GPU/Common/TextureCacheCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
clutRenderAddress_ = 0xFFFFFFFF;

if (Memory::IsValidAddress(clutAddr)) {
if (Memory::IsVRAMAddress(clutAddr)) {
if (Memory::IsVRAMAddress(clutAddr) && !g_Config.bDisableSlowFramebufEffects) {
// Clear the uncached bit, etc. to match framebuffers.
const u32 clutFramebufAddr = clutAddr & 0x3FFFFFFF;
const u32 clutFramebufEnd = clutFramebufAddr + loadBytes;
Expand Down Expand Up @@ -352,11 +352,16 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {

// It's possible for a game to (successfully) access outside valid memory.
u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
if (clutRenderAddress_ != 0xFFFFFFFF && !g_Config.bDisableSlowFramebufEffects) {
DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes);
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
if (clutRenderAddress_ != 0xFFFFFFFF) {
bool useIndexed = standardScaleFactor_ == 1;

if (!useIndexed) {
DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes);
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
}
clutRenderAddress_ = 0xFFFFFFFF;
}
} else {
#ifdef _M_SSE
Expand Down Expand Up @@ -386,7 +391,7 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
} else {
memset(clutBufRaw_, 0x00, loadBytes);
}
// Reload the clut next time.
// Update the clut (translating colors if necessary) next time.
clutLastFormat_ = 0xFFFFFFFF;
clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes);
}
Expand All @@ -411,6 +416,115 @@ void TextureCacheCommon::UnswizzleFromMem(u32 *dest, u32 destPitch, const u8 *te
DoUnswizzleTex16(texptr, dest, bxc, byc, destPitch);
}

// Decodes one mip level of the current CLUT4/8/16/32 texture into one byte per texel,
// each byte being the CLUT index after gstate.transformClutIndex() (truncated to 8 bits).
//
// format:  texture format of the level; only the GE_TFMT_CLUT* formats are accepted.
// level:   mip level whose address and dimensions are read from gstate.
// bufwout: optional; when non-null it receives the buffer width used for the level.
//
// Returns a pointer to the decoded bytes (stored inside tmpTexBuf16, bufw * height
// entries), or nullptr after reporting an error if the format is not an indexed one.
void *TextureCacheCommon::DecodeLevelToIndexed(GETextureFormat format, int level, int *bufwout) {
	const u32 texaddr = gstate.getTextureAddress(level);
	bool swizzled = gstate.isTextureSwizzled();

	const bool inVramMirror = (texaddr & 0x00600000) != 0 && Memory::IsVRAMAddress(texaddr);
	if (inVramMirror) {
		// A mirror (possibly a swizzled one) is unusual enough to be worth reporting.
		WARN_LOG_REPORT_ONCE(texmirror, G3D, "Decoding texture from VRAM mirror at %08x swizzle=%d", texaddr, swizzled ? 1 : 0);
		// Technically 2 and 6 are slightly different, but this is better than nothing probably.
		if ((texaddr & 0x00200000) != 0) {
			swizzled = !swizzled;
		}
		// Note that (texaddr & 0x00600000) == 0x00600000 is very likely to be depth texturing.
	}

	const int bufw = GetTextureBufw(level, texaddr, format);
	if (bufwout != nullptr) {
		*bufwout = bufw;
	}
	const int w = gstate.getTextureWidth(level);
	const int h = gstate.getTextureHeight(level);
	const u8 *texptr = Memory::GetPointer(texaddr);

	// All scratch buffers are sized for the wider of the stride and the visible width.
	const int scratchTexels = std::max(bufw, w) * h;
	tmpTexBuf16.resize(scratchTexels);
	tmpTexBuf32.resize(scratchTexels);
	tmpTexBufRearrange.resize(scratchTexels);

	u8 *finalBuf = (u8 *)tmpTexBuf16.data();
	const int texelCount = bufw * h;

	switch (format) {
	case GE_TFMT_CLUT4:
		{
			// Without a shared CLUT, each mip level uses its own group of 16 entries.
			const int levelClutBase = gstate.isClutSharedForMipmaps() ? 0 : level * 16;

			const u8 *src = texptr;
			if (swizzled) {
				UnswizzleFromMem(tmpTexBuf32.data(), bufw / 2, texptr, bufw, h, 0);
				src = (const u8 *)tmpTexBuf32.data();
			}

			// Each source byte packs two texels: low nibble first, then high nibble.
			for (int texel = 0; texel < texelCount; texel += 2) {
				const u8 packed = *src++;
				finalBuf[texel + 0] = gstate.transformClutIndex((packed >> 0) & 0xf) + levelClutBase;
				finalBuf[texel + 1] = gstate.transformClutIndex((packed >> 4) & 0xf) + levelClutBase;
			}
		}
		break;

	case GE_TFMT_CLUT8:
		{
			const u8 *src = texptr;
			if (swizzled) {
				UnswizzleFromMem(tmpTexBuf32.data(), bufw, texptr, bufw, h, 1);
				src = (const u8 *)tmpTexBuf32.data();
			}

			for (int texel = 0; texel < texelCount; ++texel) {
				finalBuf[texel] = gstate.transformClutIndex(*src++);
			}
		}
		break;

	case GE_TFMT_CLUT16:
		{
			const u16_le *src = (const u16_le *)texptr;
			if (swizzled) {
				UnswizzleFromMem(tmpTexBuf32.data(), bufw * 2, texptr, bufw, h, 2);
				src = (const u16_le *)tmpTexBuf32.data();
			}

			for (int texel = 0; texel < texelCount; ++texel) {
				finalBuf[texel] = gstate.transformClutIndex(*src++);
			}
		}
		break;

	case GE_TFMT_CLUT32:
		{
			const u32_le *src = (const u32_le *)texptr;
			if (swizzled) {
				UnswizzleFromMem(tmpTexBuf32.data(), bufw * 4, texptr, bufw, h, 4);
				src = (const u32_le *)tmpTexBuf32.data();
			}

			for (int texel = 0; texel < texelCount; ++texel) {
				finalBuf[texel] = gstate.transformClutIndex(*src++);
			}
		}
		break;

	default:
		// Direct color (4444, 5551, 5650, 8888) and DXT formats carry no indexes.
		ERROR_LOG_REPORT(G3D, "Invalid indexed format %d", format);
		return nullptr;
	}

	// Technically, the index can actually be up to 512.  This is pretty rare (getClutIndexStartPos.)
	// Unfortunately, not all platforms support uploading > 8 bit values.
	const bool paletteIs32Bit = gstate.getClutPaletteFormat() == GE_CMODE_32BIT_ABGR8888;
	if (!paletteIs32Bit && (gstate.getClutIndexStartPos() & 0x100) != 0) {
		ERROR_LOG_REPORT(G3D, "Unsupported indexed texture with CLUT indexes outside 0-255");
	}

	// TODO: Change to using an output and stride.
	return finalBuf;
}

bool TextureCacheCommon::GetCurrentClutBuffer(GPUDebugBuffer &buffer) {
const u32 bpp = gstate.getClutPaletteFormat() == GE_CMODE_32BIT_ABGR8888 ? 4 : 2;
const u32 pixels = 1024 / bpp;
Expand Down
2 changes: 2 additions & 0 deletions GPU/Common/TextureCacheCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class TextureCacheCommon {
STATUS_TO_SCALE = 0x80, // Pending texture scaling in a later frame.
STATUS_IS_SCALED = 0x100, // Has been scaled (can't be replaceImages'd.)
STATUS_FREE_CHANGE = 0x200, // Allow one change before marking "frequent".
STATUS_INDEXED = 0x400, // Texture is R only for on-GPU CLUT processing.  Must be its own bit: the old 04200 was octal (0x880) and overlapped STATUS_TO_SCALE.
};

// Status, but int so we can zero initialize.
Expand Down Expand Up @@ -153,6 +154,7 @@ class TextureCacheCommon {
};

bool DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA = false);
void *DecodeLevelToIndexed(GETextureFormat format, int level, int *bufwout);
void UnswizzleFromMem(u32 *dest, u32 destPitch, const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel);
bool ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw);

Expand Down
30 changes: 29 additions & 1 deletion GPU/Directx9/DepalettizeShaderDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ LPDIRECT3DPIXELSHADER9 DepalShaderCacheDX9::GetDepalettizePixelShader(GEPaletteF
}

char *buffer = new char[2048];

GenerateDepalShader(buffer, pixelFormat, HLSL_DX9);

LPDIRECT3DPIXELSHADER9 pshader;
Expand All @@ -172,4 +171,33 @@ LPDIRECT3DPIXELSHADER9 DepalShaderCacheDX9::GetDepalettizePixelShader(GEPaletteF
return depal->pixelShader;
}

// Returns the pixel shader used for indexed (on-GPU CLUT) texturing, compiling it on
// first use and caching the result in indexedShader_.
//
// Returns nullptr if compilation fails; the failure is remembered so that later calls
// return nullptr immediately instead of compiling again.
LPDIRECT3DPIXELSHADER9 DepalShaderCacheDX9::GetIndexedPixelShader() {
	// Stored in place of a real shader once compilation has failed.
	const LPDIRECT3DPIXELSHADER9 failedMarker = (LPDIRECT3DPIXELSHADER9)-1;

	if (indexedShader_.pixelShader == failedMarker) {
		// Previously failed.  Don't try again.
		return nullptr;
	}
	if (indexedShader_.pixelShader != nullptr) {
		return indexedShader_.pixelShader;
	}

	char *source = new char[2048];
	GenerateIndexedShader(source, HLSL_DX9);

	std::string compileError;
	const bool compiled = CompilePixelShader(source, &indexedShader_.pixelShader, NULL, compileError);
	if (!compiled) {
		ERROR_LOG(G3D, "Failed to compile depal pixel shader: %s\n\n%s", source, compileError.c_str());
		indexedShader_.pixelShader = nullptr;
	}

	delete[] source;

	if (indexedShader_.pixelShader != nullptr) {
		return indexedShader_.pixelShader;
	}

	// So that we know not to try again next time.
	indexedShader_.pixelShader = failedMarker;
	return nullptr;
}

} // namespace
5 changes: 5 additions & 0 deletions GPU/Directx9/DepalettizeShaderDX9.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ namespace DX9 {

// Holds a compiled Direct3D 9 pixel shader used for depalettization.
class DepalShaderDX9 {
public:
	DepalShaderDX9() {
	}

	// Null until a shader has been stored here.
	LPDIRECT3DPIXELSHADER9 pixelShader = nullptr;
};

Expand All @@ -45,6 +48,7 @@ class DepalShaderCacheDX9 {
LPDIRECT3DPIXELSHADER9 GetDepalettizePixelShader(GEPaletteFormat clutFormat, GEBufferFormat pixelFormat);
LPDIRECT3DVERTEXSHADER9 GetDepalettizeVertexShader() { return vertexShader_; }
LPDIRECT3DTEXTURE9 GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32 *rawClut);
LPDIRECT3DPIXELSHADER9 GetIndexedPixelShader();
void Clear();
void Decimate();

Expand All @@ -54,6 +58,7 @@ class DepalShaderCacheDX9 {
LPDIRECT3DVERTEXSHADER9 vertexShader_;
std::map<u32, DepalShaderDX9 *> cache_;
std::map<u32, DepalTextureDX9 *> texCache_;
DepalShaderDX9 indexedShader_;
};

} // namespace
2 changes: 1 addition & 1 deletion GPU/Directx9/FramebufferDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ namespace DX9 {
UpdateFramebufUsage(vfb);

if (vfb != displayFramebuf_ && vfb != prevDisplayFramebuf_ && vfb != prevPrevDisplayFramebuf_) {
if (age > FBO_OLD_AGE) {
if (age > FBO_OLD_AGE && (vfb->usageFlags & FB_USAGE_KEEP) == 0) {
INFO_LOG(SCEGE, "Decimating FBO for %08x (%i x %i x %i), age %i", vfb->fb_address, vfb->width, vfb->height, vfb->format, age);
DestroyFramebuf(vfb);
vfbs_.erase(vfbs_.begin() + i--);
Expand Down
3 changes: 3 additions & 0 deletions GPU/Directx9/PixelShaderGeneratorDX9.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,7 @@ bool GenerateFragmentShaderDX9(const ShaderID &id, char *buffer);
// For stencil upload
#define CONST_PS_STENCILVALUE 10

// For depal
#define CONST_PS_DEPAL_OFFSET 11

};
Loading