Skip to content

Commit

Permalink
vulkan: use 8bit indices for meshlet draws
Browse files Browse the repository at this point in the history
Requires the following Vulkan extensions:
- VK_EXT_index_type_uint8
- VK_EXT_primitive_topology_list_restart

This lets us pack the three indices of a triangle into 32 bits.
The last 8 bits are padding, which we fill with the 8-bit primitive restart index (0xFF).

Fixes NPT-71
  • Loading branch information
Ryp committed Aug 8, 2023
1 parent 30f6752 commit 1aaa5ae
Show file tree
Hide file tree
Showing 10 changed files with 75 additions and 13 deletions.
29 changes: 29 additions & 0 deletions src/renderer/shader/lib/format/bitfield.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,33 @@ uint bitfield_extract(uint bitfield, uint first_bit_offset, uint bit_count)
return (bitfield >> first_bit_offset) & mask;
}

// Packs four 8-bit values into one 32-bit word.
//
// uint_4x8: the four values to pack; only the low 8 bits of each component are used.
// Returns:  a word with .x in bits 0-7, .y in bits 8-15, .z in bits 16-23
//           and .w in bits 24-31.
uint merge_uint_4x8_to_32(uint4 uint_4x8)
{
    // Keep only the low byte of every lane so that an out-of-range component
    // cannot spill into (and corrupt) the neighbouring byte.
    const uint4 lanes = uint_4x8 & 0xFF;

    // The lanes occupy disjoint bit ranges after masking, so OR is enough to combine them.
    return lanes.x
         | (lanes.y << 8)
         | (lanes.z << 16)
         | (lanes.w << 24);
}

// Unpacks the three low bytes of a 32-bit word.
//
// uint_32: the packed word.
// Returns: (bits 0-7, bits 8-15, bits 16-23), each as a value in [0, 255].
//          Bits 24-31 of the input are ignored.
uint3 split_uint_32_to_3x8(uint uint_32)
{
    const uint byte_mask = 0xFF;

    return uint3(uint_32 & byte_mask,
                 (uint_32 >> 8) & byte_mask,
                 (uint_32 >> 16) & byte_mask);
}

// Unpacks all four bytes of a 32-bit word.
//
// uint_32: the packed word.
// Returns: (bits 0-7, bits 8-15, bits 16-23, bits 24-31), each as a value in [0, 255].
uint4 split_uint_32_to_4x8(uint uint_32)
{
    // Shift a broadcast copy of the word by a per-lane bit offset, then keep the low byte.
    const uint4 bit_offsets = uint4(0, 8, 16, 24);

    return (uint_32.xxxx >> bit_offsets) & 0xFF;
}

#endif
13 changes: 8 additions & 5 deletions src/renderer/shader/meshlet/cull_triangle_batch.comp.hlsl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "lib/base.hlsl"
#include "lib/indirect_command.hlsl"
#include "lib/vertex_pull.hlsl"
#include "lib/format/bitfield.hlsl"

#include "meshlet.share.hlsl"
#include "meshlet_culling.share.hlsl"
Expand Down Expand Up @@ -52,7 +53,7 @@ void main(uint3 gtid : SV_GroupThreadID,

const uint input_triangle_index_offset = meshlet.index_offset + gtid.x * 3;

const uint3 indices = Indices.Load3(input_triangle_index_offset * MeshletIndexSizeBytes);
const uint3 indices = Indices.Load3(input_triangle_index_offset * 4);
const uint3 indices_with_vertex_offset = indices + meshlet.vertex_offset;

// NOTE: We will read out of bounds, this might be wasteful - or even illegal. OOB reads in DirectX11 are defined to return zero, what about Vulkan?
Expand Down Expand Up @@ -143,7 +144,9 @@ void main(uint3 gtid : SV_GroupThreadID,

if (is_visible)
{
visible_index_buffer.Store3(output_triangle_index * 3 * 4, indices);
// Add extra dummy index to align all triangle indices on a 32bit boundary
uint packed_indices = merge_uint_4x8_to_32(uint4(indices, 0xff));
visible_index_buffer.Store(output_triangle_index * 4, packed_indices);
}

if (gi == 0 && lds_triangle_count > 0)
Expand All @@ -153,9 +156,9 @@ void main(uint3 gtid : SV_GroupThreadID,
Counters.InterlockedAdd(DrawCommandCounterOffset * 4, 1u, draw_command_index);

DrawIndexedIndirectCommand command;
command.indexCount = lds_triangle_count * 3;
command.indexCount = lds_triangle_count * 4; // NOTE: There's 4 indices for each triangle!
command.instanceCount = 1;
command.firstIndex = lds_triangle_offset * 3;
command.firstIndex = lds_triangle_offset * 4;

if (consts.main_pass)
{
Expand All @@ -174,7 +177,7 @@ void main(uint3 gtid : SV_GroupThreadID,
{
VisibleMeshlet visible_meshlet;
visible_meshlet.mesh_instance_id = mesh_instance.instance_id;
visible_meshlet.visible_index_offset = lds_triangle_offset * 3;
visible_meshlet.visible_triangle_offset = lds_triangle_offset;
visible_meshlet.vertex_offset = meshlet.vertex_offset;

VisibleMeshlets[draw_command_index] = visible_meshlet;
Expand Down
3 changes: 1 addition & 2 deletions src/renderer/shader/meshlet/meshlet.share.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "shared_types.hlsl"

static const hlsl_uint MeshletMaxTriangleCount = 64;
static const hlsl_uint MeshletIndexSizeBytes = 4;

struct Meshlet
{
Expand All @@ -30,7 +29,7 @@ struct Meshlet
struct VisibleMeshlet
{
hlsl_uint mesh_instance_id;
hlsl_uint visible_index_offset;
hlsl_uint visible_triangle_offset;
hlsl_uint vertex_offset;
hlsl_float _pad;
};
Expand Down
7 changes: 4 additions & 3 deletions src/renderer/shader/vis_buffer/fill_gbuffer.comp.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "lib/brdf.hlsl"
#include "lib/barycentrics.hlsl"
#include "lib/vertex_pull.hlsl"
#include "lib/format/bitfield.hlsl"
#include "gbuffer/gbuffer.hlsl"
#include "meshlet/meshlet.share.hlsl"
#include "forward.share.hlsl" // FIXME
Expand Down Expand Up @@ -58,9 +59,9 @@ void main(uint3 gtid : SV_GroupThreadID,

VisibleMeshlet visible_meshlet = visible_meshlets[vis_buffer.visible_meshlet_index];

uint visible_index_offset = visible_meshlet.visible_index_offset + vis_buffer.meshlet_triangle_id * 3;
uint3 indices = visible_index_buffer.Load3(visible_index_offset * MeshletIndexSizeBytes);
indices += visible_meshlet.vertex_offset;
uint visible_index_offset = visible_meshlet.visible_triangle_offset + vis_buffer.meshlet_triangle_id;
uint packed_indices = visible_index_buffer.Load(visible_index_offset * 4);
uint3 indices = split_uint_32_to_3x8(packed_indices) + visible_meshlet.vertex_offset;

VertexData p0;
VertexData p1;
Expand Down
22 changes: 22 additions & 0 deletions src/renderer/vulkan/Backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ void create_vulkan_renderer_backend(ReaperRoot& root, VulkanBackend& backend)
VK_KHR_SWAPCHAIN_EXTENSION_NAME,
VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME,
VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME,
VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME,
#if 0
VK_EXT_HDR_METADATA_EXTENSION_NAME,
VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
Expand Down Expand Up @@ -471,8 +473,16 @@ bool vulkan_check_physical_device(IWindow* window,
vkGetPhysicalDeviceProperties2(physical_device, &device_properties2);
VkPhysicalDeviceProperties& device_properties = device_properties2.properties;

VkPhysicalDeviceIndexTypeUint8FeaturesEXT index_uint8_feature = {};
index_uint8_feature.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT;

VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT primitive_restart_feature = {};
primitive_restart_feature.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT;
primitive_restart_feature.pNext = &index_uint8_feature;

VkPhysicalDeviceDescriptorIndexingFeatures descriptor_indexing_features = {};
descriptor_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES;
descriptor_indexing_features.pNext = &primitive_restart_feature;

VkPhysicalDeviceVulkan13Features device_vulkan13_features = {};
device_vulkan13_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES;
Expand All @@ -497,6 +507,8 @@ bool vulkan_check_physical_device(IWindow* window,
Assert(device_vulkan13_features.synchronization2 == VK_TRUE);
Assert(device_vulkan13_features.dynamicRendering == VK_TRUE);
Assert(device_vulkan12_features.shaderSampledImageArrayNonUniformIndexing == VK_TRUE);
Assert(primitive_restart_feature.primitiveTopologyListRestart == VK_TRUE);
Assert(index_uint8_feature.indexTypeUint8 == VK_TRUE);

uint32_t queue_families_count = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &queue_families_count, nullptr);
Expand Down Expand Up @@ -726,8 +738,18 @@ void vulkan_create_logical_device(ReaperRoot& root,
deviceFeatures.fillModeNonSolid = VK_TRUE;
deviceFeatures.geometryShader = VK_TRUE;

VkPhysicalDeviceIndexTypeUint8FeaturesEXT index_uint8_feature = {};
index_uint8_feature.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT;
index_uint8_feature.indexTypeUint8 = VK_TRUE;

VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT primitive_restart_feature = {};
primitive_restart_feature.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT;
primitive_restart_feature.pNext = &index_uint8_feature;
primitive_restart_feature.primitiveTopologyListRestart = VK_TRUE;

VkPhysicalDeviceVulkan13Features device_vulkan13_features = {};
device_vulkan13_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES;
device_vulkan13_features.pNext = &primitive_restart_feature;
device_vulkan13_features.synchronization2 = VK_TRUE;
device_vulkan13_features.dynamicRendering = VK_TRUE;

Expand Down
1 change: 1 addition & 0 deletions src/renderer/vulkan/renderpass/ForwardPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ namespace
};

GraphicsPipelineProperties pipeline_properties = default_graphics_pipeline_properties(&feedback_info);
pipeline_properties.input_assembly.primitiveRestartEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthTestEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthWriteEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthCompareOp =
Expand Down
1 change: 1 addition & 0 deletions src/renderer/vulkan/renderpass/GBufferPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ namespace
};

GraphicsPipelineProperties pipeline_properties = default_graphics_pipeline_properties(&feedback_info);
pipeline_properties.input_assembly.primitiveRestartEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthTestEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthWriteEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthCompareOp =
Expand Down
10 changes: 7 additions & 3 deletions src/renderer/vulkan/renderpass/MeshletCulling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@

namespace Reaper
{
constexpr u32 IndexSizeBytes = 4;
constexpr u32 IndexSizeBytes = 1;
// NOTE: Because of u8 indices we pack a triangle in 24 bits + 8 bits for a prim restart
constexpr u32 TriangleIndicesSizeBytes = 4;
constexpr u32 MaxMeshletCullingPassCount = 4;
constexpr u32 MaxMeshInstanceCount = 512;
// NOTE: Increasing this seems to make perf degrade noticeably on my intel iGPU for the same amount of geometry drawn.
Expand All @@ -39,7 +41,7 @@ constexpr u32 MaxVisibleMeshletsPerPass = 4096;
// Worst case if all meshlets of all passes aren't culled.
// This shouldn't happen, we can probably cut this by half and raise a warning when we cross the limit.
constexpr u64 VisibleIndexBufferSizeBytes =
MaxVisibleMeshletsPerPass * MaxMeshletCullingPassCount * MeshletMaxTriangleCount * 3 * IndexSizeBytes;
MaxVisibleMeshletsPerPass * MaxMeshletCullingPassCount * MeshletMaxTriangleCount * TriangleIndicesSizeBytes;
constexpr u32 MaxIndirectDrawCountPerPass = MaxVisibleMeshletsPerPass;

MeshletCullingResources create_meshlet_culling_resources(ReaperRoot& root, VulkanBackend& backend,
Expand Down Expand Up @@ -507,7 +509,9 @@ namespace
{
VkIndexType get_vk_meshlet_index_type()
{
if (IndexSizeBytes == 2)
if (IndexSizeBytes == 1)
return VK_INDEX_TYPE_UINT8_EXT;
else if (IndexSizeBytes == 2)
return VK_INDEX_TYPE_UINT16;
else if (IndexSizeBytes == 4)
return VK_INDEX_TYPE_UINT32;
Expand Down
1 change: 1 addition & 0 deletions src/renderer/vulkan/renderpass/ShadowMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ ShadowMapResources create_shadow_map_resources(ReaperRoot& root, VulkanBackend&
create_pipeline_layout(backend.device, nonstd::span(&resources.pipe.descSetLayout, 1));

GraphicsPipelineProperties pipeline_properties = default_graphics_pipeline_properties();
pipeline_properties.input_assembly.primitiveRestartEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthTestEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthWriteEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthCompareOp = ShadowUseReverseZ ? VK_COMPARE_OP_GREATER : VK_COMPARE_OP_LESS;
Expand Down
1 change: 1 addition & 0 deletions src/renderer/vulkan/renderpass/VisibilityBufferPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ namespace
};

GraphicsPipelineProperties pipeline_properties = default_graphics_pipeline_properties(&feedback_info);
pipeline_properties.input_assembly.primitiveRestartEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthTestEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthWriteEnable = VK_TRUE;
pipeline_properties.depth_stencil.depthCompareOp =
Expand Down

0 comments on commit 1aaa5ae

Please sign in to comment.