WIP vulkan: add exposure pass

Ryp · Mar 1, 2024 · 46218d2 · 46218d2
1 parent 87988c4
commit 46218d2
Show file tree

Hide file tree

Showing 14 changed files with 828 additions and 6 deletions.
diff --git a/src/renderer/CMakeLists.txt b/src/renderer/CMakeLists.txt
@@ -109,6 +109,8 @@ target_sources(${target} PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/MeshletCulling.h
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/DebugGeometryRenderPass.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/DebugGeometryRenderPass.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ExposurePass.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ExposurePass.h
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ForwardPass.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/ForwardPass.h
     ${CMAKE_CURRENT_SOURCE_DIR}/vulkan/renderpass/FrameGraphPass.cpp
@@ -161,6 +163,8 @@ set(REAPER_SHADER_SRCS
     ${REAPER_SHADER_DIR}/debug_geometry/build_cmds.comp.hlsl
     ${REAPER_SHADER_DIR}/debug_geometry/draw.frag.hlsl
     ${REAPER_SHADER_DIR}/debug_geometry/draw.vert.hlsl
+    ${REAPER_SHADER_DIR}/reduce_exposure.comp.hlsl
+    ${REAPER_SHADER_DIR}/reduce_exposure_tail.comp.hlsl
     ${REAPER_SHADER_DIR}/forward.frag.hlsl
     ${REAPER_SHADER_DIR}/forward.vert.hlsl
     ${REAPER_SHADER_DIR}/fullscreen_triangle.vert.hlsl

diff --git a/src/renderer/shader/reduce_exposure.comp.hlsl b/src/renderer/shader/reduce_exposure.comp.hlsl
@@ -0,0 +1,142 @@
+#include "lib/base.hlsl"
+
+#include "lib/color_space.hlsl"
+#include "lib/morton.hlsl"
+
+#include "reduce_exposure.share.hlsl"
+
+// NOTE: untested
+// This needs support for atomic adds and that's not there on my Intel iGPU
+// See also: shaderSharedFloat32AtomicAdd.
+#define ENABLE_SHARED_FLOAT_ATOMICS 0
+
+VK_PUSH_CONSTANT_HELPER(ReduceExposurePassParams) Consts;
+
+VK_BINDING(0, 0) SamplerState LinearClampSampler; // Sampler used by the Gather* reads of SceneHDR
+VK_BINDING(0, 1) Texture2D<float3> SceneHDR; // Input scene color; main() names the gathered values "srgb_linear"
+VK_BINDING(0, 2) RWTexture2D<float2> AvgLog2Luminance; // Output, one texel per thread group: (average log2 luma, active threads / ThreadCount)
+
+static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY; // Threads per thread group
+
+#if ENABLE_SHARED_FLOAT_ATOMICS
+groupshared float lds_sum_luma_log2; // Group-wide accumulator, cleared by thread 0 and fed by InterlockedAdd from each wave
+groupshared float lds_sum_weight; // Group-wide accumulator of the matching weights
+#else
+groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount]; // One partial sum per wave, indexed by gi / WaveGetLaneCount()
+groupshared float lds_weight[ThreadCount / MinWaveLaneCount]; // One partial weight per wave, same indexing
+#endif
+
+// Reduces the log2 luminance of SceneHDR to one texel of AvgLog2Luminance per thread group.
+//
+// Each active thread gathers one 2x2 quad and averages the log2 luma of its four texels.
+// The per-thread values are summed per wave with WaveActiveSum, then the per-wave partial sums
+// are combined either with shared float atomics or with a manual reduction through LDS.
+// Thread 0 finally writes (sum of luma / sum of weight, sum of weight / ThreadCount).
+#if ENABLE_SHARED_FLOAT_ATOMICS
+[numthreads(ExposureThreadCountX, ExposureThreadCountY, 1)]
+#else
+[numthreads(ThreadCount, 1, 1)]
+#endif
+void main(uint3 gtid : SV_GroupThreadID,
+          uint3 gid  : SV_GroupID,
+          uint3 dtid : SV_DispatchThreadID,
+          uint  gi   : SV_GroupIndex)
+{
+#if ENABLE_SHARED_FLOAT_ATOMICS
+    if (gi == 0)
+    {
+        lds_sum_luma_log2 = 0.f;
+        lds_sum_weight = 0.f;
+    }
+
+    GroupMemoryBarrierWithGroupSync();
+#endif
+
+#if ENABLE_SHARED_FLOAT_ATOMICS
+    uint2 position_ts = dtid.xy;
+#else
+    // The group is dispatched as a 1D line of threads, the 2D position comes from the Morton decode of gi.
+    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
+    uint2 position_ts = offset_ts + decode_morton_2d(gi);
+#endif
+
+    // Each thread owns a 2x2 quad: move to the texel corner shared by the four texels of that quad.
+    position_ts = position_ts * 2 + 1;
+
+    const bool is_active = all(position_ts < Consts.extent_ts);
+
+    float sum_luma_log2 = 0.f;
+    float sum_weight = 0.f;
+
+    if (is_active)
+    {
+        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;
+
+        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
+        // This introduces a small inaccuracy that I'm ignoring.
+        // Hopefully none of those inaccuracies matter when using dynamic resolution.
+        float4 quad_r = SceneHDR.GatherRed(LinearClampSampler, position_uv);
+        float4 quad_g = SceneHDR.GatherGreen(LinearClampSampler, position_uv);
+        float4 quad_b = SceneHDR.GatherBlue(LinearClampSampler, position_uv);
+
+        float3 quad01_scene_color_srgb_linear = float3(quad_r.x, quad_g.x, quad_b.x);
+        float3 quad11_scene_color_srgb_linear = float3(quad_r.y, quad_g.y, quad_b.y);
+        float3 quad10_scene_color_srgb_linear = float3(quad_r.z, quad_g.z, quad_b.z);
+        float3 quad00_scene_color_srgb_linear = float3(quad_r.w, quad_g.w, quad_b.w);
+
+        // Clamp before the log to keep black texels from producing -inf.
+        float4 luma_log2 = log2(max(0.00001f, float4(
+            luma_srgb(quad01_scene_color_srgb_linear),
+            luma_srgb(quad11_scene_color_srgb_linear),
+            luma_srgb(quad10_scene_color_srgb_linear),
+            luma_srgb(quad00_scene_color_srgb_linear))));
+
+        sum_luma_log2 = dot(luma_log2, 1.f / 4.f);
+        sum_weight = 1.f;
+    }
+
+    // After this point every lane of a wave holds the partial sums of its own wave.
+    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
+    sum_weight = WaveActiveSum(sum_weight);
+
+    uint wave_lane_count = WaveGetLaneCount();
+
+    if (WaveIsFirstLane())
+    {
+#if ENABLE_SHARED_FLOAT_ATOMICS
+        InterlockedAdd(lds_sum_luma_log2, sum_luma_log2);
+        InterlockedAdd(lds_sum_weight, sum_weight);
+#else
+        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
+        uint wave_index = gi / wave_lane_count;
+        lds_luma_log2[wave_index] = sum_luma_log2;
+        lds_weight[wave_index] = sum_weight;
+#endif
+    }
+
+#if ENABLE_SHARED_FLOAT_ATOMICS
+    GroupMemoryBarrierWithGroupSync();
+#else
+    // Manual LDS reduce pass
+    uint active_threads = ThreadCount / wave_lane_count;
+
+    for (uint threads = active_threads / 2; threads > 0; threads /= 2)
+    {
+        GroupMemoryBarrierWithGroupSync();
+
+        if (gi < threads)
+        {
+            // Both operands must come from LDS: the register of thread gi holds the partial sum of the
+            // wave that thread belongs to, which is not the content of slot gi as soon as gi > 0.
+            // Accumulating into the register counted wave 0 several times and dropped other waves
+            // whenever the group had more than two waves.
+            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
+            sum_weight = lds_weight[gi] + lds_weight[gi + threads];
+
+            lds_luma_log2[gi] = sum_luma_log2;
+            lds_weight[gi] = sum_weight;
+        }
+    }
+#endif
+
+    if (gi == 0)
+    {
+        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
+        uint2 output_position_ts = position_ts / (group_size * 2);
+
+#if ENABLE_SHARED_FLOAT_ATOMICS
+        sum_luma_log2 = lds_sum_luma_log2;
+        sum_weight = lds_sum_weight;
+#else
+        // NOTE: thread 0 already holds the group total in its registers, either from the last
+        // reduce iteration or directly from WaveActiveSum when the group is a single wave.
+#endif
+
+        // NOTE(review): if no thread of the group is active, sum_weight is 0 and this divides 0 by 0.
+        AvgLog2Luminance[output_position_ts] = float2(sum_luma_log2 / sum_weight, sum_weight / float(ThreadCount));
+    }
+}
diff --git a/src/renderer/shader/reduce_exposure.share.hlsl b/src/renderer/shader/reduce_exposure.share.hlsl
@@ -0,0 +1,32 @@
+////////////////////////////////////////////////////////////////////////////////
+/// Reaper
+///
+/// Copyright (c) 2015-2024 Thibault Schueller
+/// This file is distributed under the MIT License
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef REDUCE_EXPOSURE_SHARE_INCLUDED
+#define REDUCE_EXPOSURE_SHARE_INCLUDED
+
+#include "shared_types.hlsl"
+
+static const hlsl_uint MinWaveLaneCount = 8; // FIXME: hard-coded; sizes the per-wave LDS arrays (ThreadCount / MinWaveLaneCount)
+
+static const hlsl_uint ExposureThreadCountX = 8; // Thread group width of the exposure reduction shaders
+static const hlsl_uint ExposureThreadCountY = 8; // Thread group height of the exposure reduction shaders
+
+struct ReduceExposurePassParams // Push constants of reduce_exposure.comp.hlsl
+{
+    hlsl_uint2  extent_ts; // Threads whose quad position is not strictly below this extent are inactive
+    hlsl_float2 extent_ts_inv; // Multiplied with the quad position to get the UV used for the gathers
+};
+
+struct ReduceExposureTailPassParams // Push constants of reduce_exposure_tail.comp.hlsl
+{
+    hlsl_uint2  extent_ts; // Threads whose position is not strictly below this extent are inactive in the first stage
+    hlsl_float2 extent_ts_inv; // Multiplied with the position to get the UV used to sample the input
+    hlsl_uint2  tail_extent_ts; // Bound on the texels of the tail texture read by the last thread group
+    hlsl_uint   last_thread_group_index; // Counter value identifying the last group; presumably group count - 1, confirm on the host side
+};
+
+#endif
diff --git a/src/renderer/shader/reduce_exposure_tail.comp.hlsl b/src/renderer/shader/reduce_exposure_tail.comp.hlsl
@@ -0,0 +1,172 @@
+#include "lib/base.hlsl"
+
+#include "lib/color_space.hlsl"
+#include "lib/morton.hlsl"
+
+#include "reduce_exposure.share.hlsl"
+
+VK_PUSH_CONSTANT_HELPER(ReduceExposureTailPassParams) Consts;
+
+VK_BINDING(0, 0) SamplerState LinearClampSampler; // Sampler used to read AvgLog2LuminanceInput
+VK_BINDING(0, 1) Texture2D<float2> AvgLog2LuminanceInput; // Input: x is read as log2 luma, y as its weight
+VK_BINDING(0, 2) RWByteAddressBuffer AvgLog2LuminanceBuffer; // Output: final average log2 luma, stored as float bits at byte offset 0
+VK_BINDING(0, 3) globallycoherent RWByteAddressBuffer TailCounter; // Counter at byte offset 0, incremented once per group and reset by the last group
+VK_BINDING(0, 4) globallycoherent RWTexture2D<float2> AvgLog2LuminanceTail; // Per-group results, written by every group and read back by the last one
+
+static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY; // Threads per thread group
+
+groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount]; // One partial sum per wave, indexed by gi / WaveGetLaneCount()
+groupshared float lds_weight[ThreadCount / MinWaveLaneCount]; // One partial weight per wave, same indexing
+groupshared uint lds_thread_group_index; // Value of TailCounter before this group's increment, shared with the whole group
+
+// Tail of the exposure reduction.
+//
+// Stage 1: every thread group reduces a tile of AvgLog2LuminanceInput to one texel of
+// AvgLog2LuminanceTail, then thread 0 increments TailCounter and shares the previous value.
+// Stage 2: only the group that read Consts.last_thread_group_index from the counter keeps running.
+// It reduces the texels of AvgLog2LuminanceTail below Consts.tail_extent_ts, stores the final
+// average at byte offset 0 of AvgLog2LuminanceBuffer and resets the counter.
+[numthreads(ThreadCount, 1, 1)]
+void main(uint3 gtid : SV_GroupThreadID,
+          uint3 gid  : SV_GroupID,
+          uint3 dtid : SV_DispatchThreadID,
+          uint  gi   : SV_GroupIndex)
+{
+    // The group is dispatched as a 1D line of threads, the 2D position comes from the Morton decode of gi.
+    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
+    uint2 position_ts = offset_ts + decode_morton_2d(gi);
+
+    position_ts = position_ts * 2 + 1;
+
+    const bool is_active = all(position_ts < Consts.extent_ts);
+
+    float sum_luma_log2 = 0.f;
+    float sum_weight = 0.f;
+
+    if (is_active)
+    {
+        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;
+
+        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
+        // This introduces a small inaccuracy that I'm ignoring.
+        // Hopefully none of those inaccuracies matter when using dynamic resolution.
+        // FIXME This might matter for the tail!
+        float2 luma_weight = AvgLog2LuminanceInput.Sample(LinearClampSampler, position_uv);
+
+        sum_luma_log2 = luma_weight.x;
+        sum_weight = luma_weight.y;
+    }
+
+    // After this point every lane of a wave holds the partial sums of its own wave.
+    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
+    sum_weight = WaveActiveSum(sum_weight);
+
+    uint wave_lane_count = WaveGetLaneCount();
+
+    if (WaveIsFirstLane())
+    {
+#if ENABLE_SHARED_FLOAT_ATOMICS
+        // FIXME this needs support for atomic adds and that's not supported on my Intel iGPU
+        InterlockedAdd(lds_sum_luma_log2, sum_luma_log2);
+        InterlockedAdd(lds_sum_weight, sum_weight);
+#else
+        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
+        uint wave_index = gi / wave_lane_count;
+        lds_luma_log2[wave_index] = sum_luma_log2;
+        lds_weight[wave_index] = sum_weight;
+#endif
+    }
+
+    // Manual LDS reduce pass
+    uint active_threads = ThreadCount / wave_lane_count;
+
+    for (uint threads = active_threads / 2; threads > 0; threads /= 2)
+    {
+        GroupMemoryBarrierWithGroupSync();
+
+        if (gi < threads)
+        {
+            // Both operands must come from LDS: the register of thread gi holds the partial sum of the
+            // wave that thread belongs to, which is not the content of slot gi as soon as gi > 0.
+            // Accumulating into the register counted wave 0 several times and dropped other waves
+            // whenever the group had more than two waves.
+            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
+            sum_weight = lds_weight[gi] + lds_weight[gi + threads];
+
+            lds_luma_log2[gi] = sum_luma_log2;
+            lds_weight[gi] = sum_weight;
+        }
+    }
+
+    if (gi == 0)
+    {
+        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
+        uint2 output_position_ts = position_ts / (group_size * 2);
+
+        // NOTE: thread 0 already holds the group total in its registers, either from the last
+        // reduce iteration or directly from WaveActiveSum when the group is a single wave.
+        // NOTE(review): if the summed weight is 0 this divides 0 by 0.
+        AvgLog2LuminanceTail[output_position_ts] = float2(sum_luma_log2 / sum_weight, sum_weight / float(ThreadCount));
+
+        // Like AMD's SPD code, we're keeping the last thread group alive to make another go at the rest of the tail.
+        // The atomic returns the counter value from before the increment.
+        TailCounter.InterlockedAdd(0, uint(1), lds_thread_group_index);
+    }
+
+    // Makes lds_thread_group_index visible to the whole group, and fences the LDS slots before they are reused below.
+    GroupMemoryBarrierWithGroupSync();
+
+    // Exit if we're not the last thread group running
+    if (lds_thread_group_index != Consts.last_thread_group_index)
+        return;
+
+    // =========================================================================
+    // FIXME reduce the rest of the mip chain
+
+    position_ts = decode_morton_2d(gi); // FIXME
+
+    const bool is_active2 = all(position_ts < Consts.tail_extent_ts); // FIXME
+
+    sum_luma_log2 = 0.f; // FIXME
+    sum_weight = 0.f; // FIXME
+
+    if (is_active2)
+    {
+        float2 luma_weight = AvgLog2LuminanceTail[position_ts];
+
+        sum_luma_log2 = luma_weight.x;
+        sum_weight = luma_weight.y;
+    }
+
+    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
+    sum_weight = WaveActiveSum(sum_weight);
+
+    // uint wave_lane_count = WaveGetLaneCount();
+
+    if (WaveIsFirstLane())
+    {
+#if ENABLE_SHARED_FLOAT_ATOMICS
+        // FIXME this needs support for atomic adds and that's not supported on my Intel iGPU
+        InterlockedAdd(lds_sum_luma_log2, sum_luma_log2);
+        InterlockedAdd(lds_sum_weight, sum_weight);
+#else
+        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
+        uint wave_index = gi / wave_lane_count;
+        lds_luma_log2[wave_index] = sum_luma_log2;
+        lds_weight[wave_index] = sum_weight;
+#endif
+    }
+
+    // Manual LDS reduce pass
+    //uint active_threads = ThreadCount / wave_lane_count;
+
+    for (uint threads = active_threads / 2; threads > 0; threads /= 2)
+    {
+        GroupMemoryBarrierWithGroupSync();
+
+        if (gi < threads)
+        {
+            // Same rule as the first reduce pass: read both operands from LDS, not from the register.
+            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
+            sum_weight = lds_weight[gi] + lds_weight[gi + threads];
+
+            lds_luma_log2[gi] = sum_luma_log2;
+            lds_weight[gi] = sum_weight;
+        }
+    }
+
+    // =========================================================================
+
+    if (gi == 0)
+    {
+        // NOTE: thread 0 already holds the group total in its registers
+        float average_luma_log = sum_luma_log2 / sum_weight;
+
+        AvgLog2LuminanceBuffer.Store(0, asuint(average_luma_log));
+
+        // Clear counter for next passes
+        TailCounter.Store(0, 0);
+    }
+}
diff --git a/src/renderer/shader/swapchain_write.frag.hlsl b/src/renderer/shader/swapchain_write.frag.hlsl
@@ -14,7 +14,8 @@ VK_BINDING(0, 0) SamplerState linear_sampler;
 VK_BINDING(0, 1) Texture2D<float3> t_hdr_scene;
 VK_BINDING(0, 2) Texture2D<float3> Lighting;
 VK_BINDING(0, 3) Texture2D<float4> t_ldr_gui;
-VK_BINDING(0, 4) Texture2D<float3> t_ldr_debug;
+VK_BINDING(0, 4) ByteAddressBuffer AvgLog2Luminance;
+VK_BINDING(0, 5) Texture2D<float3> t_ldr_debug;
 
 struct PS_INPUT
 {
@@ -81,8 +82,6 @@ float3 apply_tonemapping_operator(float3 color, uint tonemap_function)
         return 0.42; // Invalid
 }
 
-static const float exposure = 1.f; // FIXME
-
 void main(in PS_INPUT input, out PS_OUTPUT output)
 {
     // Unexposed scene color in linear sRGB
@@ -94,7 +93,11 @@ void main(in PS_INPUT input, out PS_OUTPUT output)
         color = lighting;
     }
 
+    float average_log2_luma = asfloat(AvgLog2Luminance.Load(0));
+    float exposure = exp2(-average_log2_luma) * 0.18;
+
     color *= exposure;
+
     color = apply_tonemapping_operator(color, spec_tonemap_function);
 
     if (false)