-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
828 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
#include "lib/base.hlsl" | ||
|
||
#include "lib/color_space.hlsl" | ||
#include "lib/morton.hlsl" | ||
|
||
#include "reduce_exposure.share.hlsl" | ||
|
||
// NOTE: untested | ||
// This needs support for atomic adds and that's not there on my Intel iGPU | ||
// See also: shaderSharedFloat32AtomicAdd. | ||
// When 0, the shader sums per wave and combines the per-wave partials through LDS arrays instead. | ||
#define ENABLE_SHARED_FLOAT_ATOMICS 0 | ||
|
||
// Push constants for this pass; main() reads extent_ts and extent_ts_inv from it. | ||
VK_PUSH_CONSTANT_HELPER(ReduceExposurePassParams) Consts; | ||
|
||
// SceneHDR is read with Gather{Red,Green,Blue} through LinearClampSampler. | ||
// AvgLog2Luminance receives one float2 per thread group: | ||
//   x = summed log2 luma divided by summed weight, y = summed weight / ThreadCount. | ||
VK_BINDING(0, 0) SamplerState LinearClampSampler; | ||
VK_BINDING(0, 1) Texture2D<float3> SceneHDR; | ||
VK_BINDING(0, 2) RWTexture2D<float2> AvgLog2Luminance; | ||
|
||
// Number of threads in one thread group (X * Y flattened). | ||
static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY; | ||
|
||
#if ENABLE_SHARED_FLOAT_ATOMICS | ||
// Group-wide accumulators, zeroed by thread 0 and added to by the first lane of each wave. | ||
groupshared float lds_sum_luma_log2; | ||
groupshared float lds_sum_weight; | ||
#else | ||
// One slot per wave, indexed by gi / WaveGetLaneCount(). | ||
// Sized with MinWaveLaneCount so the arrays can hold the largest wave count that is expected. | ||
groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount]; | ||
groupshared float lds_weight[ThreadCount / MinWaveLaneCount]; | ||
#endif | ||
|
||
// Exposure reduction, first pass.
//
// Every thread gathers one 2x2 quad of SceneHDR and averages the log2 luminance of its
// four texels. The thread group then sums those per-thread values and thread 0 writes a
// single float2 to AvgLog2Luminance at the group's coordinate:
//   x = average log2 luminance over the threads that contributed (0 when none did)
//   y = fraction of the group's threads that contributed
//
// gtid : unused, kept for the entry point signature
// gid  : thread group coordinate, selects the 8x8 block of quads and the output texel
// dtid : only used when ENABLE_SHARED_FLOAT_ATOMICS is set
// gi   : flattened index of the thread inside its group
#if ENABLE_SHARED_FLOAT_ATOMICS
[numthreads(ExposureThreadCountX, ExposureThreadCountY, 1)]
#else
[numthreads(ThreadCount, 1, 1)]
#endif
void main(uint3 gtid : SV_GroupThreadID,
          uint3 gid : SV_GroupID,
          uint3 dtid : SV_DispatchThreadID,
          uint gi : SV_GroupIndex)
{
#if ENABLE_SHARED_FLOAT_ATOMICS
    if (gi == 0)
    {
        lds_sum_luma_log2 = 0.f;
        lds_sum_weight = 0.f;
    }

    GroupMemoryBarrierWithGroupSync();
#endif

#if ENABLE_SHARED_FLOAT_ATOMICS
    uint2 position_ts = dtid.xy;
#else
    // The group is dispatched as a 1D block of threads, Morton order maps it back to 2D.
    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
    uint2 position_ts = offset_ts + decode_morton_2d(gi);
#endif

    // One thread per 2x2 quad: address the texel corner at the centre of the quad.
    position_ts = position_ts * 2 + 1;

    const bool is_active = all(position_ts < Consts.extent_ts);

    float sum_luma_log2 = 0.f;
    float sum_weight = 0.f;

    if (is_active)
    {
        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;

        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
        // This introduces a small inaccuracy that I'm ignoring.
        // Hopefully none of those inaccuracies matter when using dynamic resolution.
        float4 quad_r = SceneHDR.GatherRed(LinearClampSampler, position_uv);
        float4 quad_g = SceneHDR.GatherGreen(LinearClampSampler, position_uv);
        float4 quad_b = SceneHDR.GatherBlue(LinearClampSampler, position_uv);

        float3 quad01_scene_color_srgb_linear = float3(quad_r.x, quad_g.x, quad_b.x);
        float3 quad11_scene_color_srgb_linear = float3(quad_r.y, quad_g.y, quad_b.y);
        float3 quad10_scene_color_srgb_linear = float3(quad_r.z, quad_g.z, quad_b.z);
        float3 quad00_scene_color_srgb_linear = float3(quad_r.w, quad_g.w, quad_b.w);

        // Clamp away from zero so that black texels do not produce -inf.
        float4 luma_log2 = log2(max(0.00001f, float4(
            luma_srgb(quad01_scene_color_srgb_linear),
            luma_srgb(quad11_scene_color_srgb_linear),
            luma_srgb(quad10_scene_color_srgb_linear),
            luma_srgb(quad00_scene_color_srgb_linear))));

        sum_luma_log2 = dot(luma_log2, 1.f / 4.f);
        sum_weight = 1.f;
    }

    // Step 1: reduce inside each wave, every lane receives its wave's total.
    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
    sum_weight = WaveActiveSum(sum_weight);

    uint wave_lane_count = WaveGetLaneCount();

    // Step 2: publish one partial sum per wave.
    if (WaveIsFirstLane())
    {
#if ENABLE_SHARED_FLOAT_ATOMICS
        InterlockedAdd(lds_sum_luma_log2, sum_luma_log2);
        InterlockedAdd(lds_sum_weight, sum_weight);
#else
        // NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT
        uint wave_index = gi / wave_lane_count;
        lds_luma_log2[wave_index] = sum_luma_log2;
        lds_weight[wave_index] = sum_weight;
#endif
    }

#if ENABLE_SHARED_FLOAT_ATOMICS
    GroupMemoryBarrierWithGroupSync();
#else
    // Step 3: tree reduction of the per-wave partials held in LDS.
    uint wave_count = ThreadCount / wave_lane_count;

    for (uint threads = wave_count / 2; threads > 0; threads /= 2)
    {
        GroupMemoryBarrierWithGroupSync();

        if (gi < threads)
        {
            // Both operands must come from LDS: the register of thread gi holds the total
            // of the wave that *contains* gi, which is not the partial stored in slot gi
            // as soon as there are more than two waves in the group.
            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
            sum_weight = lds_weight[gi] + lds_weight[gi + threads];

            lds_luma_log2[gi] = sum_luma_log2;
            lds_weight[gi] = sum_weight;
        }
    }
#endif

    if (gi == 0)
    {
        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
        uint2 output_position_ts = position_ts / (group_size * 2);

#if ENABLE_SHARED_FLOAT_ATOMICS
        sum_luma_log2 = lds_sum_luma_log2;
        sum_weight = lds_sum_weight;
#else
        // NOTE: data is already in the register from LDS
#endif

        // A group that lies entirely outside of extent_ts has no contribution:
        // write zero instead of the NaN that 0 / 0 would produce.
        const float average_luma_log2 = sum_weight > 0.f ? sum_luma_log2 / sum_weight : 0.f;

        AvgLog2Luminance[output_position_ts] = float2(average_luma_log2, sum_weight / float(ThreadCount));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
//////////////////////////////////////////////////////////////////////////////// | ||
/// Reaper | ||
/// | ||
/// Copyright (c) 2015-2024 Thibault Schueller | ||
/// This file is distributed under the MIT License | ||
//////////////////////////////////////////////////////////////////////////////// | ||
|
||
#ifndef REDUCE_EXPOSURE_SHARE_INCLUDED | ||
#define REDUCE_EXPOSURE_SHARE_INCLUDED | ||
|
||
#include "shared_types.hlsl" | ||
|
||
// Smallest wave size the exposure shaders size their per-wave LDS arrays for | ||
// (arrays hold ThreadCount / MinWaveLaneCount slots). | ||
static const hlsl_uint MinWaveLaneCount = 8; // FIXME | ||
|
||
// Thread group dimensions of the exposure reduction passes; their product is the group's thread count. | ||
static const hlsl_uint ExposureThreadCountX = 8; | ||
static const hlsl_uint ExposureThreadCountY = 8; | ||
|
||
// Push constants of the first exposure reduction pass. | ||
struct ReduceExposurePassParams | ||
{ | ||
// Exclusive upper bound for the sample positions; threads at or beyond it do not contribute. | ||
hlsl_uint2 extent_ts; | ||
// Scale applied to a sample position to obtain its UV (presumably 1 / extent_ts - confirm on the CPU side). | ||
hlsl_float2 extent_ts_inv; | ||
}; | ||
|
||
// Push constants of the tail exposure reduction pass. | ||
struct ReduceExposureTailPassParams | ||
{ | ||
// Exclusive upper bound for the sample positions of the first reduction step. | ||
hlsl_uint2 extent_ts; | ||
// Scale applied to a sample position to obtain its UV (presumably 1 / extent_ts - confirm on the CPU side). | ||
hlsl_float2 extent_ts_inv; | ||
// Exclusive upper bound for the texels read back by the last thread group in the final step. | ||
hlsl_uint2 tail_extent_ts; | ||
// Counter value that identifies the last thread group to finish | ||
// (presumably thread group count - 1 - confirm on the CPU side). | ||
hlsl_uint last_thread_group_index; | ||
}; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
#include "lib/base.hlsl" | ||
|
||
#include "lib/color_space.hlsl" | ||
#include "lib/morton.hlsl" | ||
|
||
#include "reduce_exposure.share.hlsl" | ||
|
||
// Push constants for the tail pass (extents and the index of the last thread group). | ||
VK_PUSH_CONSTANT_HELPER(ReduceExposureTailPassParams) Consts; | ||
|
||
// AvgLog2LuminanceInput: float2 texture sampled through LinearClampSampler, x is used as luma and y as weight. | ||
// AvgLog2LuminanceBuffer: receives the final average as float bits at byte offset 0. | ||
// TailCounter: uint at byte offset 0, incremented once per thread group and reset by the last one. | ||
// AvgLog2LuminanceTail: one float2 per thread group, read back by the last thread group. | ||
VK_BINDING(0, 0) SamplerState LinearClampSampler; | ||
VK_BINDING(0, 1) Texture2D<float2> AvgLog2LuminanceInput; | ||
VK_BINDING(0, 2) RWByteAddressBuffer AvgLog2LuminanceBuffer; | ||
VK_BINDING(0, 3) globallycoherent RWByteAddressBuffer TailCounter; | ||
VK_BINDING(0, 4) globallycoherent RWTexture2D<float2> AvgLog2LuminanceTail; | ||
|
||
// Number of threads in one thread group. | ||
static const uint ThreadCount = ExposureThreadCountX * ExposureThreadCountY; | ||
|
||
// One slot per wave, indexed by gi / WaveGetLaneCount(); sized for the smallest expected wave size. | ||
groupshared float lds_luma_log2[ThreadCount / MinWaveLaneCount]; | ||
groupshared float lds_weight[ThreadCount / MinWaveLaneCount]; | ||
// Value of TailCounter before this group's increment, shared with the whole group by thread 0. | ||
groupshared uint lds_thread_group_index; | ||
|
||
// Sums a per-thread (luma, weight) pair over the whole thread group.
//
// On return only thread gi == 0 is guaranteed to hold the group total, other threads hold
// intermediate values. Must be called from group-uniform control flow since it contains
// group barriers. The caller has to synchronize the group before calling it again because
// the LDS slots are reused.
//
// NOTE: Only valid with VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT,
// the wave index is derived from gi / WaveGetLaneCount().
void reduce_thread_group(uint gi, inout float sum_luma_log2, inout float sum_weight)
{
    // Step 1: reduce inside each wave, every lane receives its wave's total.
    sum_luma_log2 = WaveActiveSum(sum_luma_log2);
    sum_weight = WaveActiveSum(sum_weight);

    const uint wave_lane_count = WaveGetLaneCount();

    // Step 2: publish one partial sum per wave.
    if (WaveIsFirstLane())
    {
        const uint wave_index = gi / wave_lane_count;
        lds_luma_log2[wave_index] = sum_luma_log2;
        lds_weight[wave_index] = sum_weight;
    }

    // Step 3: tree reduction of the per-wave partials held in LDS.
    const uint wave_count = ThreadCount / wave_lane_count;

    for (uint threads = wave_count / 2; threads > 0; threads /= 2)
    {
        GroupMemoryBarrierWithGroupSync();

        if (gi < threads)
        {
            // Both operands must come from LDS: the register of thread gi holds the total
            // of the wave that *contains* gi, which is not the partial stored in slot gi
            // as soon as there are more than two waves in the group.
            sum_luma_log2 = lds_luma_log2[gi] + lds_luma_log2[gi + threads];
            sum_weight = lds_weight[gi] + lds_weight[gi + threads];

            lds_luma_log2[gi] = sum_luma_log2;
            lds_weight[gi] = sum_weight;
        }
    }
}

// Exposure reduction, tail pass.
//
// Step A (every thread group): each thread samples AvgLog2LuminanceInput once, the group
// sums the samples and thread 0 writes one float2 (average, weight fraction) to
// AvgLog2LuminanceTail, then increments TailCounter.
// Step B (last thread group to finish only): reduces the texels of AvgLog2LuminanceTail
// inside tail_extent_ts to a single average, stores it in AvgLog2LuminanceBuffer and
// resets TailCounter.
//
// gtid, dtid : unused, kept for the entry point signature
// gid        : thread group coordinate
// gi         : flattened index of the thread inside its group
[numthreads(ThreadCount, 1, 1)]
void main(uint3 gtid : SV_GroupThreadID,
          uint3 gid : SV_GroupID,
          uint3 dtid : SV_DispatchThreadID,
          uint gi : SV_GroupIndex)
{
    const uint2 offset_ts = gid.xy * uint2(ExposureThreadCountX, ExposureThreadCountY);
    uint2 position_ts = offset_ts + decode_morton_2d(gi);

    // One thread per 2x2 quad: address the texel corner at the centre of the quad.
    position_ts = position_ts * 2 + 1;

    const bool is_active = all(position_ts < Consts.extent_ts);

    float sum_luma_log2 = 0.f;
    float sum_weight = 0.f;

    if (is_active)
    {
        float2 position_uv = (float2)position_ts * Consts.extent_ts_inv;

        // We're not taking into account invalid pixels that would be gathered outside of the range when using even resolutions.
        // This introduces a small inaccuracy that I'm ignoring.
        // Hopefully none of those inaccuracies matter when using dynamic resolution.
        // FIXME This might matter for the tail!
        float2 luma_weight = AvgLog2LuminanceInput.Sample(LinearClampSampler, position_uv);

        sum_luma_log2 = luma_weight.x;
        sum_weight = luma_weight.y;
    }

    reduce_thread_group(gi, sum_luma_log2, sum_weight);

    if (gi == 0)
    {
        uint2 group_size = uint2(ExposureThreadCountX, ExposureThreadCountY);
        uint2 output_position_ts = position_ts / (group_size * 2);

        // NOTE: data is already in the register from LDS
        // Write zero instead of NaN when nothing contributed to this group.
        const float group_average_luma_log2 = sum_weight > 0.f ? sum_luma_log2 / sum_weight : 0.f;

        AvgLog2LuminanceTail[output_position_ts] = float2(group_average_luma_log2, sum_weight / float(ThreadCount));

        // Like AMD's SPD code, we're keeping the last thread group alive to make another go at the rest of the tail.
        // NOTE(review): visibility of the other groups' texture writes relies on
        // globallycoherent plus this atomic - confirm whether a DeviceMemoryBarrier() is needed.
        TailCounter.InterlockedAdd(0, uint(1), lds_thread_group_index);
    }

    // Shares the counter value with the group and makes sure all LDS reads of the
    // reduction above are done before the slots are reused below.
    GroupMemoryBarrierWithGroupSync();

    // Exit if we're not the last thread group running
    if (lds_thread_group_index != Consts.last_thread_group_index)
        return;

    // =========================================================================
    // FIXME reduce the rest of the mip chain

    position_ts = decode_morton_2d(gi); // FIXME

    const bool is_active2 = all(position_ts < Consts.tail_extent_ts); // FIXME

    sum_luma_log2 = 0.f; // FIXME
    sum_weight = 0.f; // FIXME

    if (is_active2)
    {
        float2 luma_weight = AvgLog2LuminanceTail[position_ts];

        sum_luma_log2 = luma_weight.x;
        sum_weight = luma_weight.y;
    }

    reduce_thread_group(gi, sum_luma_log2, sum_weight);

    // =========================================================================

    if (gi == 0)
    {
        // NOTE: data is already in the register from LDS
        // Write zero instead of NaN when the tail carries no weight at all.
        float average_luma_log = sum_weight > 0.f ? sum_luma_log2 / sum_weight : 0.f;

        AvgLog2LuminanceBuffer.Store(0, asuint(average_luma_log));

        // Clear counter for next passes
        TailCounter.Store(0, 0);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.