From 2bf3d262468d8869877b620e5faf4da8b1f77d03 Mon Sep 17 00:00:00 2001 From: Brad Werth Date: Tue, 4 Jun 2024 10:11:03 -0700 Subject: [PATCH] Remove vertex_pulling_transfrom from PipelineCompilationOptions. This option was only evaluated for Metal backends, and now it's required there so the option is going away. It is still configurable for tests via the PipelineOptions struct, deserialized from .ron files. This also fixes some type problems with the unpack functions in writer.rs. Metal << operator extends operand to int-sized, which then has to be cast back down to the real size before as_type bit conversion. The math for the snorm values is corrected, in some cases using the metal unpack_snorm2x16_to_float function because we can't directly cast a bit-shifted ushort value to half. --- deno_webgpu/pipeline.rs | 3 - naga/CHANGELOG.md | 1 + naga/src/back/msl/mod.rs | 4 +- naga/src/back/msl/writer.rs | 66 +-- tests/tests/root.rs | 1 + tests/tests/vertex_formats/draw.vert.wgsl | 320 ++++++++++++++ tests/tests/vertex_formats/mod.rs | 393 ++++++++++++++++++ tests/tests/vertex_indices/mod.rs | 53 +-- wgpu-core/src/device/global.rs | 3 - wgpu-core/src/device/resource.rs | 3 - wgpu-core/src/pipeline.rs | 4 - wgpu-hal/examples/halmark/main.rs | 2 - wgpu-hal/examples/ray-traced-triangle/main.rs | 1 - wgpu-hal/src/lib.rs | 3 - wgpu-hal/src/metal/device.rs | 2 +- wgpu/src/backend/wgpu_core.rs | 6 - wgpu/src/lib.rs | 3 - 17 files changed, 764 insertions(+), 104 deletions(-) create mode 100644 tests/tests/vertex_formats/draw.vert.wgsl create mode 100644 tests/tests/vertex_formats/mod.rs diff --git a/deno_webgpu/pipeline.rs b/deno_webgpu/pipeline.rs index 75bd9b3ef27..a19ac4fa7ce 100644 --- a/deno_webgpu/pipeline.rs +++ b/deno_webgpu/pipeline.rs @@ -114,7 +114,6 @@ pub fn op_webgpu_create_compute_pipeline( entry_point: compute.entry_point.map(Cow::from), constants: Cow::Owned(compute.constants.unwrap_or_default()), zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }, cache: None, }; @@ -359,7 +358,6 @@ pub fn op_webgpu_create_render_pipeline( constants: Cow::Owned(fragment.constants.unwrap_or_default()), // Required to be true for WebGPU zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }, targets: Cow::Owned(fragment.targets), }) @@ -385,7 +383,6 @@ pub fn op_webgpu_create_render_pipeline( constants: Cow::Owned(args.vertex.constants.unwrap_or_default()), // Required to be true for WebGPU zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }, buffers: Cow::Owned(vertex_buffers), }, diff --git a/naga/CHANGELOG.md b/naga/CHANGELOG.md index 2a00f01f86d..49cde4e2123 100644 --- a/naga/CHANGELOG.md +++ b/naga/CHANGELOG.md @@ -81,6 +81,7 @@ For changelogs after v0.14, see [the wgpu changelog](../CHANGELOG.md). - Make varyings' struct members unique. ([#2521](https://github.com/gfx-rs/naga/pull/2521)) **@evahop** - Add experimental vertex pulling transform flag. ([#5254](https://github.com/gfx-rs/wgpu/pull/5254)) **@bradwerth** - Fixup some generated MSL for vertex buffer unpack functions. ([#5829](https://github.com/gfx-rs/wgpu/pull/5829)) **@bradwerth** +- Make vertex pulling transform on by default. ([#5773](https://github.com/gfx-rs/wgpu/pull/5773)) **@bradwerth** #### GLSL-OUT diff --git a/naga/src/back/msl/mod.rs b/naga/src/back/msl/mod.rs index 3b33ee7a716..626475debcd 100644 --- a/naga/src/back/msl/mod.rs +++ b/naga/src/back/msl/mod.rs @@ -354,7 +354,9 @@ pub struct PipelineOptions { /// to receive the vertex buffers, lengths, and vertex id as args, /// and bounds-check the vertex id and use the index into the /// vertex buffers to access attributes, rather than using Metal's - /// [[stage-in]] assembled attribute data. + /// [[stage-in]] assembled attribute data. This is true by default, + /// but remains configurable for use by tests via deserialization + /// of this struct. There is no user-facing way to set this value. pub vertex_pulling_transform: bool, /// vertex_buffer_mappings are used during shader translation to diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs index 8b868970078..6287bbe278e 100644 --- a/naga/src/back/msl/writer.rs +++ b/naga/src/back/msl/writer.rs @@ -3953,8 +3953,8 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::float2((float(b0) - 128.0f) / 255.0f, \ - (float(b1) - 128.0f) / 255.0f);", + "{}return metal::float2(metal::max(-1.0f, as_type(b0) / 127.0f), \ + metal::max(-1.0f, as_type(b1) / 127.0f));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -3971,10 +3971,10 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::float4((float(b0) - 128.0f) / 255.0f, \ - (float(b1) - 128.0f) / 255.0f, \ - (float(b2) - 128.0f) / 255.0f, \ - (float(b3) - 128.0f) / 255.0f);", + "{}return metal::float4(metal::max(-1.0f, as_type(b0) / 127.0f), \ + metal::max(-1.0f, as_type(b1) / 127.0f), \ + metal::max(-1.0f, as_type(b2) / 127.0f), \ + metal::max(-1.0f, as_type(b3) / 127.0f));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4033,8 +4033,8 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::int2(as_type(b1 << 8 | b0), \ - as_type(b3 << 8 | b2));", + "{}return metal::int2(as_type(metal::ushort(b1 << 8 | b0)), \ + as_type(metal::ushort(b3 << 8 | b2)));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4055,10 +4055,10 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::int4(as_type(b1 << 8 | b0), \ - as_type(b3 << 8 | b2), \ - as_type(b5 << 8 | b4), \ - as_type(b7 << 8 | b6));", + "{}return metal::int4(as_type(metal::ushort(b1 << 8 | b0)), \ + as_type(metal::ushort(b3 << 8 | b2)), \ + as_type(metal::ushort(b5 << 8 | b4)), \ + as_type(metal::ushort(b7 << 8 | b6)));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4117,8 +4117,7 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::float2((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \ - (float(b3 << 8 | b2) - 32767.0f) / 65535.0f);", + "{}return metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2);", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4139,10 +4138,8 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::float4((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \ - (float(b3 << 8 | b2) - 32767.0f) / 65535.0f, \ - (float(b5 << 8 | b4) - 32767.0f) / 65535.0f, \ - (float(b7 << 8 | b6) - 32767.0f) / 65535.0f);", + "{}return metal::float4(metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2), \ + metal::unpack_snorm2x16_to_float(b5 << 24 | b4 << 16 | b7 << 8 | b6));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4159,8 +4156,8 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::float2(as_type(b1 << 8 | b0), \ - as_type(b3 << 8 | b2));", + "{}return metal::float2(as_type(metal::ushort(b1 << 8 | b0)), \ + as_type(metal::ushort(b3 << 8 | b2)));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4181,10 +4178,10 @@ impl Writer { )?; writeln!( self.out, - "{}return metal::int4(as_type(b1 << 8 | b0), \ - as_type(b3 << 8 | b2), \ - as_type(b5 << 8 | b4), \ - as_type(b7 << 8 | b6));", + "{}return metal::int4(as_type(metal::ushort(b1 << 8 | b0)), \ + as_type(metal::ushort(b3 << 8 | b2)), \ + as_type(metal::ushort(b5 << 8 | b4)), \ + as_type(metal::ushort(b7 << 8 | b6)));", back::INDENT )?; writeln!(self.out, "}}")?; @@ -4390,10 +4387,10 @@ impl Writer { let name = self.namer.call("unpackSint32"); writeln!( self.out, - "metal::int {name}(uint b0, \ - uint b1, \ - uint b2, \ - uint b3) {{" + "int {name}(uint b0, \ + uint b1, \ + uint b2, \ + uint b3) {{" )?; writeln!( self.out, @@ -4495,7 +4492,18 @@ impl Writer { )?; writeln!( self.out, - "{}return unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);", + // The following is correct for RGBA packing, but our format seems to + // match ABGR, which can be fed into the Metal builtin function + // unpack_unorm10a2_to_float. + /* + "{}uint v = (b3 << 24 | b2 << 16 | b1 << 8 | b0); \ + uint r = (v & 0xFFC00000) >> 22; \ + uint g = (v & 0x003FF000) >> 12; \ + uint b = (v & 0x00000FFC) >> 2; \ + uint a = (v & 0x00000003); \ + return metal::float4(float(r) / 1023.0f, float(g) / 1023.0f, float(b) / 1023.0f, float(a) / 3.0f);", + */ + "{}return metal::unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);", back::INDENT )?; writeln!(self.out, "}}")?; diff --git a/tests/tests/root.rs b/tests/tests/root.rs index 6ceb3818df8..384cfcf78fc 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -42,6 +42,7 @@ mod subgroup_operations; mod texture_bounds; mod texture_view_creation; mod transfer; +mod vertex_formats; mod vertex_indices; mod write_texture; mod zero_init_texture_after_discard; diff --git a/tests/tests/vertex_formats/draw.vert.wgsl b/tests/tests/vertex_formats/draw.vert.wgsl new file mode 100644 index 00000000000..39d46526407 --- /dev/null +++ b/tests/tests/vertex_formats/draw.vert.wgsl @@ -0,0 +1,320 @@ +@group(0) @binding(0) +var checksums: array; + +const index_uint = 0u; +const index_sint = 1u; +const index_unorm = 2u; +const index_snorm = 3u; +const index_float16 = 4u; +const index_float32 = 5u; + +fn init_checksums() { + checksums[index_uint] = 0.0; + checksums[index_sint] = 0.0; + checksums[index_unorm] = 0.0; + checksums[index_snorm] = 0.0; + checksums[index_float16] = 0.0; + checksums[index_float32] = 0.0; +} + +// Break down the 31 vertex formats specified at +// https://gpuweb.github.io/gpuweb/#vertex-formats into blocks +// of 8, to keep under the limits of max locations. Each +// AttributeBlockX structure will get a corresponding +// vertex_block_X function to process its attributes into +// values written to the checksums buffer. + +struct AttributeBlock0 { + // 4-byte-aligned unorm formats + @location(0) unorm8x4: vec4, + @location(1) unorm16x2: vec2, + @location(2) unorm16x4: vec4, + + // 4-byte-aligned snorm formats + @location(3) snorm8x4: vec4, + @location(4) snorm16x2: vec2, + @location(5) snorm16x4: vec4, + + // 2-byte-aligned formats + @location(6) unorm8x2: vec2, + @location(7) snorm8x2: vec2, +} + +@vertex +fn vertex_block_0(v_in: AttributeBlock0) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all unorm into one checksum value. + var all_unorm: f32 = 0.0; + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.x); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.y); + + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.x); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.y); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.z); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.w); + + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.x); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.y); + + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.x); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.y); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.z); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.w); + + checksums[index_unorm] = f32(all_unorm); + + // Accumulate all snorm into one checksum value. + var all_snorm: f32 = 0.0; + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.x); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.y); + + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.x); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.y); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.z); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.w); + + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.x); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.y); + + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.x); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.y); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.z); + all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.w); + + checksums[index_snorm] = f32(all_snorm); + + return vec4(0.0); +} + +struct AttributeBlock1 { + // 4-byte-aligned uint formats + @location(0) uint8x4: vec4, + @location(1) uint16x2: vec2, + @location(2) uint16x4: vec4, + + // 4-byte-aligned sint formats + @location(3) sint8x4: vec4, + @location(4) sint16x2: vec2, + @location(5) sint16x4: vec4, + + // 2-byte-aligned formats + @location(6) uint8x2: vec2, + @location(7) sint8x2: vec2, +} + +@vertex +fn vertex_block_1(v_in: AttributeBlock1) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all uint into one checksum value. + var all_uint: u32 = 0; + all_uint = accumulate_uint(all_uint, v_in.uint8x2.x); + all_uint = accumulate_uint(all_uint, v_in.uint8x2.y); + + all_uint = accumulate_uint(all_uint, v_in.uint8x4.x); + all_uint = accumulate_uint(all_uint, v_in.uint8x4.y); + all_uint = accumulate_uint(all_uint, v_in.uint8x4.z); + all_uint = accumulate_uint(all_uint, v_in.uint8x4.w); + + all_uint = accumulate_uint(all_uint, v_in.uint16x2.x); + all_uint = accumulate_uint(all_uint, v_in.uint16x2.y); + + all_uint = accumulate_uint(all_uint, v_in.uint16x4.x); + all_uint = accumulate_uint(all_uint, v_in.uint16x4.y); + all_uint = accumulate_uint(all_uint, v_in.uint16x4.z); + all_uint = accumulate_uint(all_uint, v_in.uint16x4.w); + + checksums[index_uint] = f32(all_uint); + + // Accumulate all sint into one checksum value. + var all_sint: i32 = 0; + all_sint = accumulate_sint(all_sint, v_in.sint8x2.x); + all_sint = accumulate_sint(all_sint, v_in.sint8x2.y); + + all_sint = accumulate_sint(all_sint, v_in.sint8x4.x); + all_sint = accumulate_sint(all_sint, v_in.sint8x4.y); + all_sint = accumulate_sint(all_sint, v_in.sint8x4.z); + all_sint = accumulate_sint(all_sint, v_in.sint8x4.w); + + all_sint = accumulate_sint(all_sint, v_in.sint16x2.x); + all_sint = accumulate_sint(all_sint, v_in.sint16x2.y); + + all_sint = accumulate_sint(all_sint, v_in.sint16x4.x); + all_sint = accumulate_sint(all_sint, v_in.sint16x4.y); + all_sint = accumulate_sint(all_sint, v_in.sint16x4.z); + all_sint = accumulate_sint(all_sint, v_in.sint16x4.w); + + checksums[index_sint] = f32(all_sint); + + return vec4(0.0); +} + +struct AttributeBlock2 { + @location(0) uint32: u32, + @location(1) uint32x2: vec2, + @location(2) uint32x3: vec3, + @location(3) uint32x4: vec4, +} + +@vertex +fn vertex_block_2(v_in: AttributeBlock2) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all uint into one checksum value. + var all_uint: u32 = 0; + all_uint = accumulate_uint(all_uint, v_in.uint32); + + all_uint = accumulate_uint(all_uint, v_in.uint32x2.x); + all_uint = accumulate_uint(all_uint, v_in.uint32x2.y); + + all_uint = accumulate_uint(all_uint, v_in.uint32x3.x); + all_uint = accumulate_uint(all_uint, v_in.uint32x3.y); + all_uint = accumulate_uint(all_uint, v_in.uint32x3.z); + + all_uint = accumulate_uint(all_uint, v_in.uint32x4.x); + all_uint = accumulate_uint(all_uint, v_in.uint32x4.y); + all_uint = accumulate_uint(all_uint, v_in.uint32x4.z); + all_uint = accumulate_uint(all_uint, v_in.uint32x4.w); + + checksums[index_uint] = f32(all_uint); + + return vec4(0.0); +} + +struct AttributeBlock3 { + @location(0) sint32: i32, + @location(1) sint32x2: vec2, + @location(2) sint32x3: vec3, + @location(3) sint32x4: vec4, +} + +@vertex +fn vertex_block_3(v_in: AttributeBlock3) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all sint into one checksum value. + var all_sint: i32 = 0; + all_sint = accumulate_sint(all_sint, v_in.sint32); + + all_sint = accumulate_sint(all_sint, v_in.sint32x2.x); + all_sint = accumulate_sint(all_sint, v_in.sint32x2.y); + + all_sint = accumulate_sint(all_sint, v_in.sint32x3.x); + all_sint = accumulate_sint(all_sint, v_in.sint32x3.y); + all_sint = accumulate_sint(all_sint, v_in.sint32x3.z); + + all_sint = accumulate_sint(all_sint, v_in.sint32x4.x); + all_sint = accumulate_sint(all_sint, v_in.sint32x4.y); + all_sint = accumulate_sint(all_sint, v_in.sint32x4.z); + all_sint = accumulate_sint(all_sint, v_in.sint32x4.w); + + checksums[index_sint] = f32(all_sint); + + return vec4(0.0); +} + +struct AttributeBlock4{ + @location(0) float32: f32, + @location(1) float32x2: vec2, + @location(2) float32x3: vec3, + @location(3) float32x4: vec4, + // TODO(SHADER_F16) + /* + @location(4) float16x2: vec2, + @location(5) float16x4: vec4, + */ +} + +@vertex +fn vertex_block_4(v_in: AttributeBlock4) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all float32 into one checksum value. + var all_float32: f32 = 0.0; + all_float32 = accumulate_float32(all_float32, v_in.float32); + + all_float32 = accumulate_float32(all_float32, v_in.float32x2.x); + all_float32 = accumulate_float32(all_float32, v_in.float32x2.y); + + all_float32 = accumulate_float32(all_float32, v_in.float32x3.x); + all_float32 = accumulate_float32(all_float32, v_in.float32x3.y); + all_float32 = accumulate_float32(all_float32, v_in.float32x3.z); + + all_float32 = accumulate_float32(all_float32, v_in.float32x4.x); + all_float32 = accumulate_float32(all_float32, v_in.float32x4.y); + all_float32 = accumulate_float32(all_float32, v_in.float32x4.z); + all_float32 = accumulate_float32(all_float32, v_in.float32x4.w); + + checksums[index_float32] = f32(all_float32); + + // TODO(SHADER_F16) + /* + // Accumulate all float16 into one checksum value. + var all_float16: f16 = 0.0; + all_float16 = accumulate_float16(all_float16, v_in.float16x2.x); + all_float16 = accumulate_float16(all_float16, v_in.float16x2.y); + + checksums[index_float16] = f32(all_float16); + */ + + return vec4(0.0); +} + +struct AttributeBlock5{ + @location(0) unorm10_10_10_2: vec4, +} + +@vertex +fn vertex_block_5(v_in: AttributeBlock5) -> @builtin(position) vec4 +{ + init_checksums(); + + // Accumulate all unorm into one checksum value. + var all_unorm: f32 = 0.0; + all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.x); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.y); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.z); + all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.w); + + checksums[index_unorm] = f32(all_unorm); + + return vec4(0.0); +} + +fn accumulate_uint(accum: u32, val: u32) -> u32 { + return accum + val; +} + +fn accumulate_sint(accum: i32, val: i32) -> i32 { + return accum + val; +} + +fn accumulate_unorm(accum: f32, val: f32) -> f32 { + return accum + val; +} + +fn accumulate_snorm(accum: f32, val: f32) -> f32 { + return accum + val; +} + +// TODO(SHADER_F16) +/* +fn accumulate_float16(accum: f16, val: f16) -> f16 { + return accum + val; +} +*/ + +fn accumulate_float32(accum: f32, val: f32) -> f32 { + return accum + val; +} + +@fragment +fn fragment_main() -> @location(0) vec4 { + return vec4(0.0); +} diff --git a/tests/tests/vertex_formats/mod.rs b/tests/tests/vertex_formats/mod.rs new file mode 100644 index 00000000000..d35e889bdec --- /dev/null +++ b/tests/tests/vertex_formats/mod.rs @@ -0,0 +1,393 @@ +//! Tests that vertex formats pass through to vertex shaders accurately. + +use std::num::NonZeroU64; + +use wgpu::util::{BufferInitDescriptor, DeviceExt}; + +use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext}; + +#[derive(Debug, Copy, Clone)] +enum TestCase { + UnormsAndSnorms, + UintsAndSintsSmall, + UintsBig, + SintsBig, + Floats, + Unorm1010102, +} + +struct Test<'a> { + case: TestCase, + entry_point: &'a str, + attributes: &'a [wgt::VertexAttribute], + input: &'a [u8], + checksums: &'a [f32], +} + +async fn vertex_formats_all(ctx: TestingContext) { + let attributes_block_0 = &wgpu::vertex_attr_array![ + 0 => Unorm8x4, + 1 => Unorm16x2, + 2 => Unorm16x4, + 3 => Snorm8x4, + 4 => Snorm16x2, + 5 => Snorm16x4, + 6 => Unorm8x2, + 7 => Snorm8x2, + ]; + + let attributes_block_1 = &wgpu::vertex_attr_array![ + 0 => Uint8x4, + 1 => Uint16x2, + 2 => Uint16x4, + 3 => Sint8x4, + 4 => Sint16x2, + 5 => Sint16x4, + 6 => Uint8x2, + 7 => Sint8x2, + ]; + + let attributes_block_2 = &wgpu::vertex_attr_array![ + 0 => Uint32, + 1 => Uint32x2, + 2 => Uint32x3, + 3 => Uint32x4, + ]; + + let attributes_block_3 = &wgpu::vertex_attr_array![ + 0 => Sint32, + 1 => Sint32x2, + 2 => Sint32x3, + 3 => Sint32x4, + ]; + + let attributes_block_4 = &wgpu::vertex_attr_array![ + 0 => Float32, + 1 => Float32x2, + 2 => Float32x3, + 3 => Float32x4, + // TODO(SHADER_F16) + /* + 4 => Float16x2, + 5 => Float16x4, + */ + ]; + + let tests = vec![ + Test { + case: TestCase::UnormsAndSnorms, + entry_point: "vertex_block_0", + attributes: attributes_block_0, + input: &[ + 128u8, 128u8, 128u8, 128u8, // Unorm8x4 (0.5, 0.5, 0.5, 0.5) + 0u8, 128u8, 0u8, 128u8, // Unorm16x2 (0.5, 0.5) + 0u8, 64u8, 0u8, 64u8, 0u8, 64u8, 0u8, + 64u8, // Unorm16x4 (0.25, 0.25, 0.25, 0.25) + 127u8, 127u8, 127u8, 127u8, // Snorm8x4 (1, 1, 1, 1) + 0u8, 128u8, 0u8, 128u8, // Snorm16x2 (-1, -1) + 0u8, 127u8, 0u8, 127u8, 0u8, 127u8, 0u8, 127u8, // Snorm16x4 (1, 1, 1, 1) + 255u8, 255u8, // Unorm8x2 (1, 1) + 128u8, 128u8, // Snorm8x2 (-1, -1) + ], + checksums: &[0.0, 0.0, 6.0, 4.0, 0.0, 0.0], + }, + Test { + case: TestCase::UintsAndSintsSmall, + entry_point: "vertex_block_1", + attributes: attributes_block_1, + input: &[ + 4u8, 8u8, 16u8, 32u8, // Uint8x4 (4, 8, 16, 32) + 64u8, 0u8, 128u8, 0u8, // Uint16x2 (64, 128) + 0u8, 1u8, 0u8, 2u8, 0u8, 4u8, 0u8, 8u8, // Uint16x4 (256, 512, 1024, 2048) + 127u8, 127u8, 2u8, 0u8, // Sint8x4 (127, 127, 2, 0) + 255u8, 255u8, 1u8, 0u8, // Sint16x2 (-1, 1) + 128u8, 255u8, 128u8, 255u8, 0u8, 1u8, 240u8, + 255u8, // Sint16x4 (-128, -128, 256, -16) + 1u8, 2u8, // Uint8x2 (1, 2) + 128u8, 128u8, // Sint8x2 (-128, -128) + ], + checksums: &[4095.0, -16.0, 0.0, 0.0, 0.0, 0.0], + }, + Test { + case: TestCase::UintsBig, + entry_point: "vertex_block_2", + attributes: attributes_block_2, + input: &[ + 1u8, 0u8, 0u8, 0u8, // Uint32x2 (1) + 2u8, 0u8, 0u8, 0u8, 4u8, 0u8, 0u8, 0u8, // Uint32x2 (2, 4) + 8u8, 0u8, 0u8, 0u8, 16u8, 0u8, 0u8, 0u8, 32u8, 0u8, 0u8, + 0u8, // Uint32x3 (8, 16, 32) + 64u8, 0u8, 0u8, 0u8, 128u8, 0u8, 0u8, 0u8, 0u8, 1u8, 0u8, 0u8, 0u8, 2u8, 0u8, + 0u8, // Uint32x4 (64, 128, 256, 512) + ], + checksums: &[1023.0, 0.0, 0.0, 0.0, 0.0, 0.0], + }, + Test { + case: TestCase::SintsBig, + entry_point: "vertex_block_3", + attributes: attributes_block_3, + input: &[ + 128u8, 255u8, 255u8, 255u8, // Sint32 (-128) + 120u8, 0u8, 0u8, 0u8, 8u8, 0u8, 0u8, 0u8, // Sint32x2 (120, 8) + 252u8, 255u8, 255u8, 255u8, 2u8, 0u8, 0u8, 0u8, 2u8, 0u8, 0u8, + 0u8, // Sint32x3 (-4, 2, 2) + 24u8, 252u8, 255u8, 255u8, 88u8, 2u8, 0u8, 0u8, 44u8, 1u8, 0u8, 0u8, 99u8, 0u8, + 0u8, 0u8, // Sint32x4 (-1000, 600, 300, 99) + ], + checksums: &[0.0, -1.0, 0.0, 0.0, 0.0, 0.0], + }, + Test { + case: TestCase::Floats, + entry_point: "vertex_block_4", + attributes: attributes_block_4, + input: &[ + 0u8, 0u8, 0u8, 63u8, // Float32 (0.5) + 0u8, 0u8, 0u8, 191u8, 0u8, 0u8, 128u8, 64u8, // Float32x2 (-0.5, 4.0) + 0u8, 0u8, 0u8, 192u8, 0u8, 0u8, 204u8, 194u8, 0u8, 0u8, 200u8, + 66u8, // Float32x3 (-2.0, -102.0, 100.0) + 0u8, 0u8, 92u8, 66u8, 0u8, 0u8, 72u8, 194u8, 0u8, 0u8, 32u8, 65u8, 0u8, 0u8, 128u8, + 63u8, // Float32x4 (55.0, -50.0, 10.0, 1.0) + // TODO(SHADER_F16) + // Expected value is -1.5 + /* + 0u8, 60u8, 85u8, 53u8, // Float16x2 (1.0, 0.33) + 71u8, 57u8, 0u8, 192u8, 0u8, 188u8, 0u8, 184u8, // Float16x4 (0.66, -2.0, -1.0, -0.5) + */ + ], + checksums: &[0.0, 0.0, 0.0, 0.0, 0.0, 16.0], + }, + ]; + + vertex_formats_common(ctx, &tests).await; +} + +async fn vertex_formats_10_10_10_2(ctx: TestingContext) { + let attributes_block_5 = &wgpu::vertex_attr_array![ + 0 => Unorm10_10_10_2, + ]; + + let tests = vec![Test { + case: TestCase::Unorm1010102, + entry_point: "vertex_block_5", + attributes: attributes_block_5, + input: &[ + // We are aiming for rgba of (0.5, 0.5, 0.5, 0.66) + // Packing AA BB BBBB BBBB GGGG GGGG GG RR RRRR RRRR + // Binary 10 10 0000 0000 1000 0000 00 10 0000 0000 + // Hex A0 08 02 00 + // Decimal 160 8 2 0 + // unorm 0.66 0.5 0.5 0.5 = 2.16 + 0u8, 2u8, 8u8, 160u8, // Unorm10_10_10_2 + ], + checksums: &[0.0, 0.0, 2.16, 0.0, 0.0, 0.0], + }]; + + vertex_formats_common(ctx, &tests).await; +} + +async fn vertex_formats_common(ctx: TestingContext, tests: &[Test<'_>]) { + let shader = ctx + .device + .create_shader_module(wgpu::include_wgsl!("draw.vert.wgsl")); + + let bgl = ctx + .device + .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &[wgpu::BindGroupLayoutEntry { + binding: 0, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: NonZeroU64::new(4), + }, + visibility: wgpu::ShaderStages::VERTEX, + count: None, + }], + }); + + let ppl = ctx + .device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bgl], + push_constant_ranges: &[], + }); + + let dummy = ctx + .device + .create_texture_with_data( + &ctx.queue, + &wgpu::TextureDescriptor { + label: Some("dummy"), + size: wgpu::Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8Unorm, + usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_DST, + view_formats: &[], + }, + wgpu::util::TextureDataOrder::LayerMajor, + &[0, 0, 0, 1], + ) + .create_view(&wgpu::TextureViewDescriptor::default()); + + let mut failed = false; + for test in tests { + let buffer_input = ctx.device.create_buffer_init(&BufferInitDescriptor { + label: None, + contents: bytemuck::cast_slice(test.input), + usage: wgpu::BufferUsages::VERTEX, + }); + + let pipeline_desc = wgpu::RenderPipelineDescriptor { + label: None, + layout: Some(&ppl), + vertex: wgpu::VertexState { + buffers: &[wgpu::VertexBufferLayout { + array_stride: 0, // Calculate, please! + step_mode: wgpu::VertexStepMode::Vertex, + attributes: test.attributes, + }], + module: &shader, + entry_point: test.entry_point, + compilation_options: Default::default(), + }, + primitive: wgpu::PrimitiveState::default(), + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + fragment: Some(wgpu::FragmentState { + module: &shader, + entry_point: "fragment_main", + compilation_options: Default::default(), + targets: &[Some(wgpu::ColorTargetState { + format: wgpu::TextureFormat::Rgba8Unorm, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + })], + }), + multiview: None, + cache: None, + }; + + let pipeline = ctx.device.create_render_pipeline(&pipeline_desc); + + let expected = test.checksums; + let buffer_size = (std::mem::size_of_val(&expected[0]) * expected.len()) as u64; + let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: buffer_size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + + let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: buffer_size, + usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::STORAGE, + mapped_at_creation: false, + }); + + let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor { + label: None, + layout: &bgl, + entries: &[wgpu::BindGroupEntry { + binding: 0, + resource: gpu_buffer.as_entire_binding(), + }], + }); + + let mut encoder1 = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + + let mut rpass = encoder1.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + ops: wgpu::Operations::default(), + resolve_target: None, + view: &dummy, + })], + depth_stencil_attachment: None, + timestamp_writes: None, + occlusion_query_set: None, + }); + + rpass.set_vertex_buffer(0, buffer_input.slice(..)); + rpass.set_pipeline(&pipeline); + rpass.set_bind_group(0, &bg, &[]); + + // Draw three vertices and no instance, which is enough to generate the + // checksums. + rpass.draw(0..3, 0..1); + + drop(rpass); + + let mut encoder2 = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + + encoder2.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, buffer_size); + + // See https://github.com/gfx-rs/wgpu/issues/4732 for why this is split between two submissions + // with a hard wait in between. + ctx.queue.submit([encoder1.finish()]); + ctx.async_poll(wgpu::Maintain::wait()) + .await + .panic_on_timeout(); + ctx.queue.submit([encoder2.finish()]); + let slice = cpu_buffer.slice(..); + slice.map_async(wgpu::MapMode::Read, |_| ()); + ctx.async_poll(wgpu::Maintain::wait()) + .await + .panic_on_timeout(); + let data: Vec = bytemuck::cast_slice(&slice.get_mapped_range()).to_vec(); + + let case_name = format!("Case {:?}", test.case); + + // Calculate the difference between data and expected. Since the data is + // a bunch of float checksums, we allow a fairly large epsilon, which helps + // with the accumulation of float rounding errors. + const EPSILON: f32 = 0.05; + + let mut deltas = data.iter().zip(expected.iter()).map(|(d, e)| (d - e).abs()); + if deltas.any(|x| x > EPSILON) { + eprintln!( + "Failed: Got: {:?} Expected: {:?} - {case_name}", + data, expected, + ); + failed = true; + continue; + } + + eprintln!("Passed: {case_name}"); + } + + assert!(!failed); +} + +#[gpu_test] +static VERTEX_FORMATS_ALL: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .test_features_limits() + .features(wgpu::Features::VERTEX_WRITABLE_STORAGE), + ) + .run_async(vertex_formats_all); + +// Some backends can handle Unorm-10-10-2, but GL backends seem to throw this error: +// Validation Error: GL_INVALID_ENUM in glVertexAttribFormat(type = GL_UNSIGNED_INT_10_10_10_2) +#[gpu_test] +static VERTEX_FORMATS_10_10_10_2: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .expect_fail(FailureCase::backend(wgpu::Backends::GL)) + .test_features_limits() + .features(wgpu::Features::VERTEX_WRITABLE_STORAGE), + ) + .run_async(vertex_formats_10_10_10_2); diff --git a/tests/tests/vertex_indices/mod.rs b/tests/tests/vertex_indices/mod.rs index 59048ef31c1..dcc2ca82f54 100644 --- a/tests/tests/vertex_indices/mod.rs +++ b/tests/tests/vertex_indices/mod.rs @@ -166,7 +166,6 @@ struct Test { id_source: IdSource, draw_call_kind: DrawCallKind, encoder_kind: EncoderKind, - vertex_pulling_transform: bool, } impl Test { @@ -280,15 +279,6 @@ async fn vertex_index_common(ctx: TestingContext) { cache: None, }; let builtin_pipeline = ctx.device.create_render_pipeline(&pipeline_desc); - pipeline_desc - .vertex - .compilation_options - .vertex_pulling_transform = true; - let builtin_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc); - pipeline_desc - .vertex - .compilation_options - .vertex_pulling_transform = false; pipeline_desc.vertex.entry_point = "vs_main_buffers"; pipeline_desc.vertex.buffers = &[ @@ -304,15 +294,6 @@ async fn vertex_index_common(ctx: TestingContext) { }, ]; let buffer_pipeline = ctx.device.create_render_pipeline(&pipeline_desc); - pipeline_desc - .vertex - .compilation_options - .vertex_pulling_transform = true; - let buffer_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc); - pipeline_desc - .vertex - .compilation_options - .vertex_pulling_transform = false; let dummy = ctx .device @@ -341,18 +322,12 @@ async fn vertex_index_common(ctx: TestingContext) { .cartesian_product(IdSource::iter()) .cartesian_product(DrawCallKind::iter()) .cartesian_product(EncoderKind::iter()) - .cartesian_product([false, true]) - .map( - |((((case, id_source), draw_call_kind), encoder_kind), vertex_pulling_transform)| { - Test { - case, - id_source, - draw_call_kind, - encoder_kind, - vertex_pulling_transform, - } - }, - ) + .map(|(((case, id_source), draw_call_kind), encoder_kind)| Test { + case, + id_source, + draw_call_kind, + encoder_kind, + }) .collect::>(); let features = ctx.adapter.features(); @@ -360,20 +335,8 @@ async fn vertex_index_common(ctx: TestingContext) { let mut failed = false; for test in tests { let pipeline = match test.id_source { - IdSource::Buffers => { - if test.vertex_pulling_transform { - &buffer_pipeline_vpt - } else { - &buffer_pipeline - } - } - IdSource::Builtins => { - if test.vertex_pulling_transform { - &builtin_pipeline_vpt - } else { - &builtin_pipeline - } - } + IdSource::Buffers => &buffer_pipeline, + IdSource::Builtins => &builtin_pipeline, }; let expected = test.expectation(&ctx); diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs index e5643a3da92..812b2f84546 100644 --- a/wgpu-core/src/device/global.rs +++ b/wgpu-core/src/device/global.rs @@ -1451,7 +1451,6 @@ impl Global { .vertex .stage .zero_initialize_workgroup_memory, - vertex_pulling_transform: desc.vertex.stage.vertex_pulling_transform, }; ResolvedVertexState { stage, @@ -1478,7 +1477,6 @@ impl Global { .vertex .stage .zero_initialize_workgroup_memory, - vertex_pulling_transform: state.stage.vertex_pulling_transform, }; Some(ResolvedFragmentState { stage, @@ -1688,7 +1686,6 @@ impl Global { entry_point: desc.stage.entry_point.clone(), constants: desc.stage.constants.clone(), zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory, - vertex_pulling_transform: desc.stage.vertex_pulling_transform, }; let desc = ResolvedComputePipelineDescriptor { diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index c364711f5dc..5984ba1497c 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -2665,7 +2665,6 @@ impl Device { entry_point: final_entry_point_name.as_ref(), constants: desc.stage.constants.as_ref(), zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory, - vertex_pulling_transform: false, }, cache: cache.as_ref().and_then(|it| it.raw.as_ref()), }; @@ -3085,7 +3084,6 @@ impl Device { entry_point: &vertex_entry_point_name, constants: stage_desc.constants.as_ref(), zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory, - vertex_pulling_transform: stage_desc.vertex_pulling_transform, } }; @@ -3141,7 +3139,6 @@ impl Device { zero_initialize_workgroup_memory: fragment_state .stage .zero_initialize_workgroup_memory, - vertex_pulling_transform: false, }) } None => None, diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs index b422ced5eb0..1d7612900d5 100644 --- a/wgpu-core/src/pipeline.rs +++ b/wgpu-core/src/pipeline.rs @@ -147,8 +147,6 @@ pub struct ProgrammableStageDescriptor<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, - /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. - pub vertex_pulling_transform: bool, } /// Describes a programmable pipeline stage. @@ -176,8 +174,6 @@ pub struct ResolvedProgrammableStageDescriptor<'a, A: HalApi> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, - /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. - pub vertex_pulling_transform: bool, } /// Number of implicit bind groups derived at pipeline creation. diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index a657b161b42..8395a5507e6 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -257,7 +257,6 @@ impl Example { entry_point: "vs_main", constants: &constants, zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }, vertex_buffers: &[], fragment_stage: Some(hal::ProgrammableStage { @@ -265,7 +264,6 @@ impl Example { entry_point: "fs_main", constants: &constants, zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }), primitive: wgt::PrimitiveState { topology: wgt::PrimitiveTopology::TriangleStrip, diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 1cde9fa2516..7cd6547f2c4 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -379,7 +379,6 @@ impl Example { entry_point: "main", constants: &Default::default(), zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, }, cache: None, }) diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 6f470f4ddc9..18f132e31a1 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -1862,8 +1862,6 @@ pub struct ProgrammableStage<'a, A: Api> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, - /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. - pub vertex_pulling_transform: bool, } // Rust gets confused about the impl requirements for `A` @@ -1874,7 +1872,6 @@ impl Clone for ProgrammableStage<'_, A> { entry_point: self.entry_point, constants: self.constants, zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory, - vertex_pulling_transform: self.vertex_pulling_transform, } } } diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index efafc98e1b3..d9525999d8b 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -158,7 +158,7 @@ impl super::Device { metal::MTLPrimitiveTopologyClass::Point => true, _ => false, }, - vertex_pulling_transform: stage.vertex_pulling_transform, + vertex_pulling_transform: true, vertex_buffer_mappings: vertex_buffer_mappings.to_vec(), }; diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs index 6485aefcdeb..2ce8b836fb0 100644 --- a/wgpu/src/backend/wgpu_core.rs +++ b/wgpu/src/backend/wgpu_core.rs @@ -1181,10 +1181,6 @@ impl crate::Context for ContextWgpuCore { .vertex .compilation_options .zero_initialize_workgroup_memory, - vertex_pulling_transform: desc - .vertex - .compilation_options - .vertex_pulling_transform, }, buffers: Borrowed(&vertex_buffers), }, @@ -1199,7 +1195,6 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: frag .compilation_options .zero_initialize_workgroup_memory, - vertex_pulling_transform: false, }, targets: Borrowed(frag.targets), }), @@ -1252,7 +1247,6 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: desc .compilation_options .zero_initialize_workgroup_memory, - vertex_pulling_transform: false, }, cache: desc.cache.map(|c| c.id.into()), }; diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index d895b696cf4..190e99f7073 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -2059,8 +2059,6 @@ pub struct PipelineCompilationOptions<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, - /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. - pub vertex_pulling_transform: bool, } impl<'a> Default for PipelineCompilationOptions<'a> { @@ -2074,7 +2072,6 @@ impl<'a> Default for PipelineCompilationOptions<'a> { Self { constants, zero_initialize_workgroup_memory: true, - vertex_pulling_transform: false, } } }