Skip to content

Commit

Permalink
WIP vulkan: add exposure pass
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryp committed Mar 1, 2024
1 parent 87988c4 commit 46218d2
Show file tree
Hide file tree
Showing 14 changed files with 828 additions and 6 deletions.
4 changes: 4 additions & 0 deletions src/renderer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ target_sources(${target} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/MeshletCulling.h
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/DebugGeometryRenderPass.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/DebugGeometryRenderPass.h
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ExposurePass.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ExposurePass.h
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ForwardPass.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ForwardPass.h
${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/FrameGraphPass.cpp
Expand Down Expand Up @@ -161,6 +163,8 @@ set(REAPER_SHADER_SRCS
${REAPER_SHADER_DIR}/debug_geometry/build_cmds.comp.hlsl
${REAPER_SHADER_DIR}/debug_geometry/draw.frag.hlsl
${REAPER_SHADER_DIR}/debug_geometry/draw.vert.hlsl
${REAPER_SHADER_DIR}/reduce_exposure.comp.hlsl
${REAPER_SHADER_DIR}/reduce_exposure_tail.comp.hlsl
${REAPER_SHADER_DIR}/forward.frag.hlsl
${REAPER_SHADER_DIR}/forward.vert.hlsl
${REAPER_SHADER_DIR}/fullscreen_triangle.vert.hlsl
Expand Down
142 changes: 142 additions & 0 deletions src/renderer/shader/reduce_exposure.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include "lib/base.hlsl"

#include "lib/color_space.hlsl"
#include "lib/morton.hlsl"

#include "reduce_exposure.share.hlsl"

// Selects how per-wave partial sums are combined inside a thread group:
//   1 = accumulate directly into two groupshared floats with atomic float adds,
//   0 = store one partial sum per wave in groupshared arrays and reduce them manually.
// NOTE: untested
// This needs support for atomic adds and that's not there on my Intel iGPU
// See also: shaderSharedFloat32AtomicAdd.
#define ENABLE_SHARED_FLOAT_ATOMICS 0

// Push constants for this pass, see ReduceExposurePassParams in reduce_exposure.share.hlsl.
VK_PUSH_CONSTANT_HELPER(ReduceExposurePassParams) Consts;

// Sampler used for the Gather* fetches on SceneHDR.
VK_BINDING(0, 0) SamplerState LinearClampSampler;
// Input color; main() treats it as linear sRGB when computing luma.
VK_BINDING(0, 1) Texture2D<float3> SceneHDR;
// Output: one float2 per thread group, x = average log2 luma, y = fraction of active threads.
VK_BINDING(0, 2) RWTexture2D<float2> AvgLog2Luminance;

// Total number of threads in one thread group.
static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY;

#if ENABLE_SHARED_FLOAT_ATOMICS
// Group-wide accumulators, cleared by thread 0 at the start of main().
groupshared float lds_sum_luma_log2;
groupshared float lds_sum_weight;
#else
// One slot per wave. Sized for the largest possible wave count, which is reached when every wave
// has only MinWaveLaneCount lanes.
// NOTE(review): a device with fewer lanes per wave than MinWaveLaneCount would index out of bounds.
groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount];
groupshared float lds_weight[ThreadCount / MinWaveLaneCount];
#endif

// First step of the exposure reduction.
//
// Each thread gathers one 2x2 texel quad of SceneHDR and averages the log2 luma of its four texels,
// so one thread group covers (2 * ExposureThreadCountX) x (2 * ExposureThreadCountY) texels.
// Thread 0 of every group writes a single float2 to AvgLog2Luminance:
//   x = average log2 luma over the active threads of the group (0 when no thread was active),
//   y = fraction of the group's threads that were active, i.e. inside Consts.extent_ts.
//
// gtid: unused.
// gid:  thread group coordinates, used to place the group in the input (non-atomic path).
// dtid: dispatch thread coordinates, used to place the thread in the input (atomic path).
// gi:   flattened index of the thread inside its group.
#if ENABLE_SHARED_FLOAT_ATOMICS
[numthreads(ExposureThreadCountX, ExposureThreadCountY, 1)]
#else
[numthreads(ThreadCount, 1, 1)]
#endif
void main(uint3 gtid : SV_GroupThreadID,
          uint3 gid : SV_GroupID,
          uint3 dtid : SV_DispatchThreadID,
          uint gi : SV_GroupIndex)
{
#if ENABLE_SHARED_FLOAT_ATOMICS
    // Clear the group-wide accumulators before anyone adds to them.
    if (gi == 0)
    {
        lds_sum_luma_log2 = 0.f;
        lds_sum_weight = 0.f;
    }

    GroupMemoryBarrierWithGroupSync();

    uint2 position_ts = dtid.xy;
#else
    // The group is dispatched as a 1D list of threads, the 2D position inside the group is
    // rebuilt from the flat index with a morton decode.
    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
    uint2 position_ts = offset_ts + decode_morton_2d(gi);
#endif

    // Aim at the corner shared by the four texels of this thread's 2x2 quad.
    position_ts = position_ts * 2 + 1;

    const bool is_active = all(position_ts < Consts.extent_ts);

    float sum_luma_log2 = 0.f;
    float sum_weight = 0.f;

    if (is_active)
    {
        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;

        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
        // This introduces a small inaccuracy that I'm ignoring.
        // Hopefully none of those inaccuracies matter when using dynamic resolution.
        float4 quad_r = SceneHDR.GatherRed(LinearClampSampler, position_uv);
        float4 quad_g = SceneHDR.GatherGreen(LinearClampSampler, position_uv);
        float4 quad_b = SceneHDR.GatherBlue(LinearClampSampler, position_uv);

        float3 quad01_scene_color_srgb_linear = float3(quad_r.x, quad_g.x, quad_b.x);
        float3 quad11_scene_color_srgb_linear = float3(quad_r.y, quad_g.y, quad_b.y);
        float3 quad10_scene_color_srgb_linear = float3(quad_r.z, quad_g.z, quad_b.z);
        float3 quad00_scene_color_srgb_linear = float3(quad_r.w, quad_g.w, quad_b.w);

        // Clamp the luma away from zero so that log2() stays finite on black texels.
        float4 luma_log2 = log2(max(0.00001f, float4(
            luma_srgb(quad01_scene_color_srgb_linear),
            luma_srgb(quad11_scene_color_srgb_linear),
            luma_srgb(quad10_scene_color_srgb_linear),
            luma_srgb(quad00_scene_color_srgb_linear))));

        // Average of the four texels, this thread then counts for a weight of one.
        sum_luma_log2 = dot(luma_log2, 1.f / 4.f);
        sum_weight = 1.f;
    }

    // Reduce inside each wave first, inactive threads contribute zero to both sums.
    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
    sum_weight = WaveActiveSum(sum_weight);

    uint wave_lane_count = WaveGetLaneCount();

    if (WaveIsFirstLane())
    {
#if ENABLE_SHARED_FLOAT_ATOMICS
        InterlockedAdd(lds_sum_luma_log2, sum_luma_log2);
        InterlockedAdd(lds_sum_weight, sum_weight);
#else
        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
        uint wave_index = gi / wave_lane_count;
        lds_luma_log2[wave_index] = sum_luma_log2;
        lds_weight[wave_index] = sum_weight;
#endif
    }

#if ENABLE_SHARED_FLOAT_ATOMICS
    GroupMemoryBarrierWithGroupSync();
#else
    // Manual LDS reduce pass over the per-wave partial sums.
    uint active_threads = ThreadCount / wave_lane_count;

    for (uint threads = active_threads / 2; threads > 0; threads /= 2)
    {
        GroupMemoryBarrierWithGroupSync();

        if (gi < threads)
        {
            // Both operands have to come from LDS: the register of thread 'gi' holds the sum of
            // the wave that thread belongs to, which is not the partial sum stored in slot 'gi'
            // as soon as there are more than two waves in the group.
            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
            sum_weight = lds_weight[gi] + lds_weight[gi + threads];

            lds_luma_log2[gi] = sum_luma_log2;
            lds_weight[gi] = sum_weight;
        }
    }
#endif

    if (gi == 0)
    {
        // Thread 0 sits on the first quad of the group, dividing by the texel footprint of a
        // group gives back the group coordinates.
        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
        uint2 output_position_ts = position_ts / (group_size * 2);

#if ENABLE_SHARED_FLOAT_ATOMICS
        sum_luma_log2 = lds_sum_luma_log2;
        sum_weight = lds_sum_weight;
#else
        // NOTE: data is already in the register from LDS
        // (or straight from the wave reduce when the group is made of a single wave)
#endif

        // A group without any active thread has a zero weight, avoid writing 0/0 = NaN since it
        // would contaminate every later reduction step.
        const float average_luma_log2 = (sum_weight > 0.f) ? (sum_luma_log2 / sum_weight) : 0.f;

        AvgLog2Luminance[output_position_ts] = float2(average_luma_log2, sum_weight / float(ThreadCount));
    }
}
32 changes: 32 additions & 0 deletions src/renderer/shader/reduce_exposure.share.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
////////////////////////////////////////////////////////////////////////////////
/// Reaper
///
/// Copyright (c) 2015-2024 Thibault Schueller
/// This file is distributed under the MIT License
////////////////////////////////////////////////////////////////////////////////

#ifndef REDUCE_EXPOSURE_SHARE_INCLUDED
#define REDUCE_EXPOSURE_SHARE_INCLUDED

#include "shared_types.hlsl"

// Smallest wave size the exposure shaders are prepared for. It sizes their per-wave groupshared
// arrays (ThreadCount / MinWaveLaneCount slots).
static const hlsl_uint MinWaveLaneCount = 8; // FIXME

// Thread group dimensions of the exposure reduce passes.
// The shaders flatten them into ExposureThreadCountX * ExposureThreadCountY threads per group.
static const hlsl_uint ExposureThreadCountX = 8;
static const hlsl_uint ExposureThreadCountY = 8;

// Push constants of reduce_exposure.comp.hlsl.
struct ReduceExposurePassParams
{
    // Threads whose sample position is not strictly inside this extent do not contribute.
    // NOTE(review): presumably the size of the input texture in texels - confirm on the host side.
    hlsl_uint2 extent_ts;
    // Multiplied with the sample position to get the UV used for fetching the input.
    // NOTE(review): presumably 1 / extent_ts - confirm on the host side.
    hlsl_float2 extent_ts_inv;
};

// Push constants of reduce_exposure_tail.comp.hlsl.
struct ReduceExposureTailPassParams
{
    // Threads whose sample position is not strictly inside this extent do not contribute.
    // NOTE(review): presumably the size of the input texture in texels - confirm on the host side.
    hlsl_uint2 extent_ts;
    // Multiplied with the sample position to get the UV used for fetching the input.
    // NOTE(review): presumably 1 / extent_ts - confirm on the host side.
    hlsl_float2 extent_ts_inv;
    // Bound applied to the positions read back from the intermediate tail texture by the last
    // running thread group.
    hlsl_uint2 tail_extent_ts;
    // Compared against the value the tail counter had before a group incremented it, the group
    // for which both match goes on with the final reduction.
    // NOTE(review): presumably 'dispatched thread group count - 1' - confirm on the host side.
    hlsl_uint last_thread_group_index;
};

#endif
172 changes: 172 additions & 0 deletions src/renderer/shader/reduce_exposure_tail.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#include "lib/base.hlsl"

#include "lib/color_space.hlsl"
#include "lib/morton.hlsl"

#include "reduce_exposure.share.hlsl"

// Push constants for this pass, see ReduceExposureTailPassParams in reduce_exposure.share.hlsl.
VK_PUSH_CONSTANT_HELPER(ReduceExposureTailPassParams) Consts;

// Sampler used to fetch AvgLog2LuminanceInput.
VK_BINDING(0, 0) SamplerState LinearClampSampler;
// Input of the pass, main() reads x as an average log2 luma and y as its weight.
VK_BINDING(0, 1) Texture2D<float2> AvgLog2LuminanceInput;
// Final result: the average log2 luma is stored as raw float bits at byte offset 0.
VK_BINDING(0, 2) RWByteAddressBuffer AvgLog2LuminanceBuffer;
// Counter at byte offset 0, incremented once per thread group and reset by the last group.
VK_BINDING(0, 3) globallycoherent RWByteAddressBuffer TailCounter;
// Intermediate results: written once per thread group, then read back by the last running group.
VK_BINDING(0, 4) globallycoherent RWTexture2D<float2> AvgLog2LuminanceTail;

// Total number of threads in one thread group.
static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY;

// One slot per wave. Sized for the largest possible wave count, which is reached when every wave
// has only MinWaveLaneCount lanes.
groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount];
groupshared float lds_weight[ThreadCount / MinWaveLaneCount];
// Value of TailCounter before this group's increment, broadcast from thread 0 to the whole group.
groupshared uint lds_thread_group_index;

// Sums a (luma, weight) pair over the whole thread group.
// The totals are only guaranteed to be valid in the thread with gi == 0 on return.
// Has to be called from group-uniform control flow since it contains group barriers.
void reduce_exposure_thread_group_sum(uint gi, inout float sum_luma_log2, inout float sum_weight)
{
    // Reduce inside each wave first.
    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
    sum_weight = WaveActiveSum(sum_weight);

    const uint wave_lane_count = WaveGetLaneCount();

    if (WaveIsFirstLane())
    {
        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
        const uint wave_index = gi / wave_lane_count;
        lds_luma_log2[wave_index] = sum_luma_log2;
        lds_weight[wave_index] = sum_weight;
    }

    // Manual LDS reduce pass over the per-wave partial sums.
    const uint active_threads = ThreadCount / wave_lane_count;

    for (uint threads = active_threads / 2; threads > 0; threads /= 2)
    {
        GroupMemoryBarrierWithGroupSync();

        if (gi < threads)
        {
            // Both operands have to come from LDS: the register of thread 'gi' holds the sum of
            // the wave that thread belongs to, which is not the partial sum stored in slot 'gi'
            // as soon as there are more than two waves in the group.
            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
            sum_weight = lds_weight[gi] + lds_weight[gi + threads];

            lds_luma_log2[gi] = sum_luma_log2;
            lds_weight[gi] = sum_weight;
        }
    }
}

// Tail of the exposure reduction.
//
// Phase 1, every thread group: each thread fetches one (average log2 luma, weight) sample from
// AvgLog2LuminanceInput, the group reduces them and thread 0 writes the result for the group to
// AvgLog2LuminanceTail, then bumps TailCounter.
// Phase 2, last running thread group only: reduces AvgLog2LuminanceTail over
// Consts.tail_extent_ts, stores the final average log2 luma in AvgLog2LuminanceBuffer and resets
// TailCounter.
//
// gtid, dtid: unused.
// gid: thread group coordinates, used to place the group in the input.
// gi:  flattened index of the thread inside its group.
[numthreads(ThreadCount, 1, 1)]
void main(uint3 gtid : SV_GroupThreadID,
          uint3 gid : SV_GroupID,
          uint3 dtid : SV_DispatchThreadID,
          uint gi : SV_GroupIndex)
{
    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
    uint2 position_ts = offset_ts + decode_morton_2d(gi);

    // Aim at the corner shared by four input texels.
    position_ts = position_ts * 2 + 1;

    const bool is_active = all(position_ts < Consts.extent_ts);

    float sum_luma_log2 = 0.f;
    float sum_weight = 0.f;

    if (is_active)
    {
        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;

        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
        // This introduces a small inaccuracy that I'm ignoring.
        // Hopefully none of those inaccuracies matter when using dynamic resolution.
        // FIXME This might matter for the tail!
        float2 luma_weight = AvgLog2LuminanceInput.Sample(LinearClampSampler, position_uv);

        // The input holds averages, they have to be scaled by their weight again before being
        // summed, otherwise partially covered groups bias the result.
        // NOTE(review): the filtered fetch blends the averages and the weights of four texels
        // separately, so this is only exact when those four texels share the same weight.
        sum_luma_log2 = luma_weight.x * luma_weight.y;
        sum_weight = luma_weight.y;
    }

    reduce_exposure_thread_group_sum(gi, sum_luma_log2, sum_weight);

    if (gi == 0)
    {
        // Thread 0 sits on the first sample of the group, dividing by the footprint of a group
        // gives back the group coordinates.
        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
        uint2 output_position_ts = position_ts / (group_size * 2);

        // NOTE: data is already in the register from LDS
        // Avoid writing 0/0 = NaN when nothing contributed to this group.
        const float group_luma_log2 = (sum_weight > 0.f) ? (sum_luma_log2 / sum_weight) : 0.f;

        AvgLog2LuminanceTail[output_position_ts] = float2(group_luma_log2, sum_weight / float(ThreadCount));

        // Like AMD's SPD code, we're keeping the last thread group alive to make another go at the rest of the tail.
        // NOTE(review): this relies on 'globallycoherent' alone to make the write above visible
        // to the last group, consider a DeviceMemoryBarrier() before the atomic.
        TailCounter.InterlockedAdd(0, uint(1), lds_thread_group_index);
    }

    // Makes lds_thread_group_index visible to the whole group, and separates the LDS accesses of
    // both reduce passes.
    GroupMemoryBarrierWithGroupSync();

    // Exit if we're not the last thread group running
    if (lds_thread_group_index != Consts.last_thread_group_index)
        return;

    // =========================================================================
    // FIXME reduce the rest of the mip chain

    position_ts = decode_morton_2d(gi); // FIXME

    const bool is_active2 = all(position_ts < Consts.tail_extent_ts); // FIXME

    sum_luma_log2 = 0.f; // FIXME
    sum_weight = 0.f; // FIXME

    if (is_active2)
    {
        float2 luma_weight = AvgLog2LuminanceTail[position_ts];

        // Same as above: turn the stored average back into a weighted sum.
        sum_luma_log2 = luma_weight.x * luma_weight.y;
        sum_weight = luma_weight.y;
    }

    reduce_exposure_thread_group_sum(gi, sum_luma_log2, sum_weight);

    // =========================================================================

    if (gi == 0)
    {
        // NOTE: data is already in the register from LDS
        // Fall back on zero instead of storing NaN when nothing contributed at all.
        float average_luma_log = (sum_weight > 0.f) ? (sum_luma_log2 / sum_weight) : 0.f;

        AvgLog2LuminanceBuffer.Store(0, asuint(average_luma_log));

        // Clear counter for next passes
        TailCounter.Store(0, 0);
    }
}
9 changes: 6 additions & 3 deletions src/renderer/shader/swapchain_write.frag.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ VK_BINDING(0, 0) SamplerState linear_sampler;
VK_BINDING(0, 1) Texture2D<float3> t_hdr_scene;
VK_BINDING(0, 2) Texture2D<float3> Lighting;
VK_BINDING(0, 3) Texture2D<float4> t_ldr_gui;
VK_BINDING(0, 4) Texture2D<float3> t_ldr_debug;
VK_BINDING(0, 4) ByteAddressBuffer AvgLog2Luminance;
VK_BINDING(0, 5) Texture2D<float3> t_ldr_debug;

struct PS_INPUT
{
Expand Down Expand Up @@ -81,8 +82,6 @@ float3 apply_tonemapping_operator(float3 color, uint tonemap_function)
return 0.42; // Invalid
}

static const float exposure = 1.f; // FIXME

void main(in PS_INPUT input, out PS_OUTPUT output)
{
// Unexposed scene color in linear sRGB
Expand All @@ -94,7 +93,11 @@ void main(in PS_INPUT input, out PS_OUTPUT output)
color = lighting;
}

float average_log2_luma = asfloat(AvgLog2Luminance.Load(0));
float exposure = exp2(-average_log2_luma) * 0.18;

color *= exposure;

color = apply_tonemapping_operator(color, spec_tonemap_function);

if (false)
Expand Down
Loading

0 comments on commit 46218d2

Please sign in to comment.