diff --git a/impeller/compiler/shader_lib/impeller/gaussian.glsl b/impeller/compiler/shader_lib/impeller/gaussian.glsl index 62874bec96d9c..9dd104d6e4ce2 100644 --- a/impeller/compiler/shader_lib/impeller/gaussian.glsl +++ b/impeller/compiler/shader_lib/impeller/gaussian.glsl @@ -6,51 +6,53 @@ #define GAUSSIAN_GLSL_ #include +#include /// Gaussian distribution function. -float IPGaussian(float x, float sigma) { - float variance = sigma * sigma; - return exp(-0.5 * x * x / variance) / (kSqrtTwoPi * sigma); +float16_t IPGaussian(float16_t x, float16_t sigma) { + float16_t variance = sigma * sigma; + return exp(-0.5hf * x * x / variance) / (float16_t(kSqrtTwoPi) * sigma); } /// Abramowitz and Stegun erf approximation. -float IPErf(float x) { - float a = abs(x); +float16_t IPErf(float16_t x) { + float16_t a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - float b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; - return sign(x) * (1 - 1 / (b * b * b * b)); + float16_t b = + (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; + return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); } /// Vec2 variation for the Abramowitz and Stegun erf approximation. -vec2 IPVec2Erf(vec2 x) { - vec2 a = abs(x); +f16vec2 IPVec2Erf(f16vec2 x) { + f16vec2 a = abs(x); // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1 - vec2 b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0; - return sign(x) * (1 - 1 / (b * b * b * b)); + f16vec2 b = (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf; + return sign(x) * (1.0hf - 1.0hf / (b * b * b * b)); } /// The indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. 
-float IPGaussianIntegral(float x, float sigma) { +float16_t IPGaussianIntegral(float16_t x, float16_t sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1 + IPErf(x * (kHalfSqrtTwo / sigma))) * 0.5; + return (1.0hf + IPErf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; } /// Vec2 variation for the indefinite integral of the Gaussian function. /// Uses a very close approximation of Erf. -vec2 IPVec2GaussianIntegral(vec2 x, float sigma) { +f16vec2 IPVec2GaussianIntegral(f16vec2 x, float16_t sigma) { // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2 - return (1 + IPVec2Erf(x * (kHalfSqrtTwo / sigma))) * 0.5; + return (1.0hf + IPVec2Erf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf; } /// Simpler (but less accurate) approximation of the Gaussian integral. -vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) { - return 1 / (1 + exp(-kSqrtThree / sigma * x)); +f16vec2 IPVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) { + return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x)); } /// Simple logistic sigmoid with a domain of [-1, 1] and range of [0, 1]. -float IPSigmoid(float x) { - return 1.03731472073 / (1 + exp(-4 * x)) - 0.0186573603638; +float16_t IPSigmoid(float16_t x) { + return 1.03731472073hf / (1.0hf + exp(-4.0hf * x)) - 0.0186573603638hf; } #endif diff --git a/impeller/compiler/shader_lib/impeller/texture.glsl b/impeller/compiler/shader_lib/impeller/texture.glsl index a952c0b9c640e..4b7ba3235d640 100644 --- a/impeller/compiler/shader_lib/impeller/texture.glsl +++ b/impeller/compiler/shader_lib/impeller/texture.glsl @@ -141,6 +141,15 @@ vec4 IPSampleDecal(sampler2D texture_sampler, vec2 coords) { return texture(texture_sampler, coords); } +/// Sample a texture with decal tile mode. 
+f16vec4 IPHalfSampleDecal(f16sampler2D texture_sampler, vec2 coords) { + if (any(lessThan(coords, vec2(0))) || + any(greaterThanEqual(coords, vec2(1)))) { + return f16vec4(0.0); + } + return texture(texture_sampler, coords); +} + /// Sample a texture, emulating a specific tile mode. /// /// This is useful for Impeller graphics backend that don't have native support diff --git a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc index fb5e6ad498fcf..3f0a1f467c9d4 100644 --- a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc +++ b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc @@ -198,10 +198,9 @@ std::optional DirectionalGaussianBlurFilterContents::RenderFilter( frag_info.blur_radius = r.radius; // The blur direction is in input UV space. - frag_info.blur_direction = - pass_transform.Invert().TransformDirection(Vector2(1, 0)).Normalize(); - - frag_info.texture_size = Point(input_snapshot->GetCoverage().value().size); + frag_info.blur_uv_offset = + pass_transform.Invert().TransformDirection(Vector2(1, 0)).Normalize() / + Point(input_snapshot->GetCoverage().value().size); Command cmd; cmd.label = SPrintF("Gaussian Blur Filter (Radius=%.2f)", diff --git a/impeller/entity/shaders/border_mask_blur.frag b/impeller/entity/shaders/border_mask_blur.frag index b28dfc8210380..88abc3fc64d08 100644 --- a/impeller/entity/shaders/border_mask_blur.frag +++ b/impeller/entity/shaders/border_mask_blur.frag @@ -15,42 +15,42 @@ // integral (using an erf approximation) to the 4 edges of the UV rectangle and // multiplying them. 
-uniform sampler2D texture_sampler; +uniform f16sampler2D texture_sampler; uniform FragInfo { - float src_factor; - float inner_blur_factor; - float outer_blur_factor; + float16_t src_factor; + float16_t inner_blur_factor; + float16_t outer_blur_factor; - vec2 sigma_uv; + f16vec2 sigma_uv; } frag_info; in vec2 v_texture_coords; -out vec4 frag_color; +out f16vec4 frag_color; -float BoxBlurMask(vec2 uv) { +float16_t BoxBlurMask(f16vec2 uv) { // LTRB - return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // - IPGaussianIntegral(1 - uv.x, frag_info.sigma_uv.x) * // - IPGaussianIntegral(1 - uv.y, frag_info.sigma_uv.y); + return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) * // + IPGaussianIntegral(1.0hf - uv.x, frag_info.sigma_uv.x) * // + IPGaussianIntegral(1.0hf - uv.y, frag_info.sigma_uv.y); } void main() { - vec4 image_color = texture(texture_sampler, v_texture_coords); - float blur_factor = BoxBlurMask(v_texture_coords); + f16vec4 image_color = texture(texture_sampler, v_texture_coords); + float16_t blur_factor = BoxBlurMask(f16vec2(v_texture_coords)); - float within_bounds = - float(v_texture_coords.x >= 0 && v_texture_coords.y >= 0 && - v_texture_coords.x < 1 && v_texture_coords.y < 1); - float inner_factor = + float16_t within_bounds = + float16_t(v_texture_coords.x >= 0.0 && v_texture_coords.y >= 0.0 && + v_texture_coords.x < 1.0 && v_texture_coords.y < 1.0); + float16_t inner_factor = (frag_info.inner_blur_factor * blur_factor + frag_info.src_factor) * within_bounds; - float outer_factor = - frag_info.outer_blur_factor * blur_factor * (1 - within_bounds); + float16_t outer_factor = + frag_info.outer_blur_factor * blur_factor * (1.0hf - within_bounds); - float mask_factor = inner_factor + outer_factor; + float16_t mask_factor = inner_factor + outer_factor; frag_color = image_color * mask_factor; } diff --git 
a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl index 4a218303efc27..9df092e1299e4 100644 --- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl +++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl @@ -18,35 +18,34 @@ #include #include -uniform sampler2D texture_sampler; +uniform f16sampler2D texture_sampler; uniform BlurInfo { - vec2 texture_size; - vec2 blur_direction; + f16vec2 blur_uv_offset; // The blur sigma and radius have a linear relationship which is defined // host-side, but both are useful controls here. Sigma (pixels per standard // deviation) is used to define the gaussian function itself, whereas the // radius is used to limit how much of the function is integrated. - float blur_sigma; - float blur_radius; + float16_t blur_sigma; + float16_t blur_radius; } blur_info; #if ENABLE_ALPHA_MASK -uniform sampler2D alpha_mask_sampler; +uniform f16sampler2D alpha_mask_sampler; uniform MaskInfo { - float src_factor; - float inner_blur_factor; - float outer_blur_factor; + float16_t src_factor; + float16_t inner_blur_factor; + float16_t outer_blur_factor; } mask_info; #endif -vec4 Sample(sampler2D tex, vec2 coords) { +f16vec4 Sample(f16sampler2D tex, vec2 coords) { #if ENABLE_DECAL_SPECIALIZATION - return IPSampleDecal(tex, coords); + return IPHalfSampleDecal(tex, coords); #else return texture(tex, coords); #endif @@ -55,31 +54,35 @@ vec4 Sample(sampler2D tex, vec2 coords) { in vec2 v_texture_coords; in vec2 v_src_texture_coords; -out vec4 frag_color; +out f16vec4 frag_color; void main() { - vec4 total_color = vec4(0); - float gaussian_integral = 0; - vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size; + f16vec4 total_color = f16vec4(0.0hf); + float16_t gaussian_integral = 0.0hf; - for (float i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) { - float gaussian = IPGaussian(i, blur_info.blur_sigma); + for (float16_t i = -blur_info.blur_radius; i
<= blur_info.blur_radius; i++) { + float16_t gaussian = IPGaussian(i, blur_info.blur_sigma); gaussian_integral += gaussian; total_color += - gaussian * - Sample(texture_sampler, // sampler - v_texture_coords + blur_uv_offset * i // texture coordinates - ); + gaussian * Sample(texture_sampler, // sampler + v_texture_coords + blur_info.blur_uv_offset * + i // texture coordinates + ); } frag_color = total_color / gaussian_integral; #if ENABLE_ALPHA_MASK - vec4 src_color = Sample(alpha_mask_sampler, // sampler - v_src_texture_coords // texture coordinates + f16vec4 src_color = Sample(alpha_mask_sampler, // sampler + v_src_texture_coords // texture coordinates ); - float blur_factor = mask_info.inner_blur_factor * float(src_color.a > 0) + - mask_info.outer_blur_factor * float(src_color.a == 0); + // Neither branch below runs for a negative or NaN alpha; default to 0. + float16_t blur_factor = 0.0hf; + if (src_color.a > 0.0hf) { + blur_factor = mask_info.inner_blur_factor; + } else if (src_color.a == 0.0hf) { + blur_factor = mask_info.outer_blur_factor; + } frag_color = frag_color * blur_factor + src_color * mask_info.src_factor; #endif diff --git a/impeller/entity/shaders/rrect_blur.frag b/impeller/entity/shaders/rrect_blur.frag index 5b0ddff80976c..0369ef7e93bbf 100644 --- a/impeller/entity/shaders/rrect_blur.frag +++ b/impeller/entity/shaders/rrect_blur.frag @@ -6,58 +6,61 @@ #include uniform FragInfo { - vec4 color; - float blur_sigma; - vec2 rect_size; - float corner_radius; + f16vec4 color; + f16vec2 rect_size; + float16_t blur_sigma; + float16_t corner_radius; } frag_info; in vec2 v_position; -out vec4 frag_color; +out f16vec4 frag_color; const int kSampleCount = 4; -float RRectDistance(vec2 sample_position, vec2 half_size) { - vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; - return length(max(space, 0.0)) + min(max(space.x, space.y), 0.0) - - frag_info.corner_radius; +float16_t RRectDistance(f16vec2 sample_position, f16vec2 half_size) { + f16vec2 space = abs(sample_position) - half_size + frag_info.corner_radius; +
return length(max(space, float16_t(0.0hf))) + + min(max(space.x, space.y), float16_t(0.0hf)) - frag_info.corner_radius; } /// Closed form unidirectional rounded rect blur mask solution using the /// analytical Gaussian integral (with approximated erf). -float RRectShadowX(vec2 sample_position, vec2 half_size) { +float16_t RRectShadowX(f16vec2 sample_position, f16vec2 half_size) { // Compute the X direction distance field (not incorporating the Y distance) // for the rounded rect. - float space = - min(0, half_size.y - frag_info.corner_radius - abs(sample_position.y)); - float rrect_distance = + float16_t space = + min(float16_t(0.0hf), + half_size.y - frag_info.corner_radius - abs(sample_position.y)); + float16_t rrect_distance = half_size.x - frag_info.corner_radius + - sqrt(max(0, frag_info.corner_radius * frag_info.corner_radius - - space * space)); + sqrt(max( + float16_t(0.0hf), + frag_info.corner_radius * frag_info.corner_radius - space * space)); // Map the linear distance field to the approximate Gaussian integral. - vec2 integral = IPVec2FastGaussianIntegral( - sample_position.x + vec2(-rrect_distance, rrect_distance), + f16vec2 integral = IPVec2FastGaussianIntegral( + sample_position.x + f16vec2(-rrect_distance, rrect_distance), frag_info.blur_sigma); return integral.y - integral.x; } -float RRectShadow(vec2 sample_position, vec2 half_size) { +float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) { // Limit the sampling range to 3 standard deviations in the Y direction from // the kernel center to incorporate 99.7% of the color contribution. 
- float half_sampling_range = frag_info.blur_sigma * 3; + float16_t half_sampling_range = frag_info.blur_sigma * 3.0hf; - float begin_y = max(-half_sampling_range, sample_position.y - half_size.y); - float end_y = min(half_sampling_range, sample_position.y + half_size.y); - float interval = (end_y - begin_y) / kSampleCount; + float16_t begin_y = + max(-half_sampling_range, sample_position.y - half_size.y); + float16_t end_y = min(half_sampling_range, sample_position.y + half_size.y); + float16_t interval = (end_y - begin_y) / float16_t(kSampleCount); // Sample the X blur kSampleCount times, weighted by the Gaussian function. - float result = 0; + float16_t result = 0.0hf; for (int sample_i = 0; sample_i < kSampleCount; sample_i++) { - float y = begin_y + interval * (sample_i + 0.5); - result += RRectShadowX(vec2(sample_position.x, sample_position.y - y), + float16_t y = begin_y + interval * (float16_t(sample_i) + 0.5hf); + result += RRectShadowX(f16vec2(sample_position.x, sample_position.y - y), half_size) * IPGaussian(y, frag_info.blur_sigma) * interval; } @@ -68,10 +71,10 @@ float RRectShadow(vec2 sample_position, vec2 half_size) { void main() { frag_color = frag_info.color; - vec2 half_size = frag_info.rect_size * 0.5; - vec2 sample_position = v_position - half_size; + f16vec2 half_size = frag_info.rect_size * 0.5hf; + f16vec2 sample_position = f16vec2(v_position) - half_size; - if (frag_info.blur_sigma > 0) { + if (frag_info.blur_sigma > 0.0hf) { frag_color *= RRectShadow(sample_position, half_size); } else { frag_color *= -RRectDistance(sample_position, half_size); diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json index 6c9fc4d8546d8..da75088513d82 100644 --- a/impeller/tools/malioc.json +++ b/impeller/tools/malioc.json @@ -1440,7 +1440,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 5, + "fp16_arithmetic": 44, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -1448,8 +1448,8 
@@ "arith_fma" ], "longest_path_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1470,8 +1470,8 @@ "arith_fma" ], "shortest_path_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1483,8 +1483,8 @@ "arith_fma" ], "total_cycles": [ - 0.8125, - 0.8125, + 0.875, + 0.875, 0.203125, 0.25, 0.0, @@ -1495,7 +1495,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 12, - "work_registers_used": 22 + "work_registers_used": 18 } } } @@ -3276,7 +3276,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 55, + "fp16_arithmetic": 53, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -3305,9 +3305,9 @@ "arith_cvt" ], "shortest_path_cycles": [ - 0.3125, - 0.09375, - 0.3125, + 0.328125, + 0.078125, + 0.328125, 0.25, 0.0, 0.25, @@ -3318,9 +3318,9 @@ "arith_cvt" ], "total_cycles": [ - 0.515625, - 0.265625, - 0.515625, + 0.578125, + 0.25, + 0.578125, 0.5, 0.0, 0.5, @@ -3348,7 +3348,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 47, + "fp16_arithmetic": 45, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -3377,9 +3377,9 @@ "texture" ], "shortest_path_cycles": [ - 0.15625, - 0.09375, - 0.15625, + 0.1875, + 0.078125, + 0.1875, 0.0625, 0.0, 0.25, @@ -3390,9 +3390,9 @@ "texture" ], "total_cycles": [ - 0.265625, - 0.265625, + 0.34375, 0.25, + 0.34375, 0.125, 0.0, 0.5, @@ -3402,7 +3402,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 14, - "work_registers_used": 15 + "work_registers_used": 14 } } } @@ -5803,7 +5803,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 10, + "fp16_arithmetic": 86, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -5811,9 +5811,9 @@ "arith_fma" ], "longest_path_cycles": [ - 0.8125, - 0.8125, - 0.234375, + 0.90625, + 0.90625, + 0.265625, 0.25, 0.0, 0.25, @@ -5833,9 +5833,9 
@@ "arith_fma" ], "shortest_path_cycles": [ - 0.8125, - 0.8125, - 0.203125, + 0.90625, + 0.90625, + 0.234375, 0.25, 0.0, 0.25, @@ -5846,9 +5846,9 @@ "arith_fma" ], "total_cycles": [ - 0.8125, - 0.8125, - 0.234375, + 0.90625, + 0.90625, + 0.265625, 0.25, 0.0, 0.25, @@ -5857,8 +5857,8 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 10, - "work_registers_used": 32 + "uniform_registers_used": 12, + "work_registers_used": 29 } } }, @@ -5903,7 +5903,7 @@ }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 3 + "work_registers_used": 2 } } } @@ -6633,7 +6633,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 63, + "fp16_arithmetic": 66, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6665,7 +6665,7 @@ ], "shortest_path_cycles": [ 0.25, - 0.171875, + 0.125, 0.25, 0.25, 0.0, @@ -6674,14 +6674,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_sfu", - "varying", - "texture" + "arith_cvt" ], "total_cycles": [ - 0.5, - 0.359375, - 0.484375, + 0.53125, + 0.328125, + 0.53125, 0.5, 0.0, 0.5, @@ -6690,7 +6688,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 10, + "uniform_registers_used": 12, "work_registers_used": 21 } } @@ -6721,7 +6719,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 4.619999885559082, + 3.299999952316284, 2.0, 0.0 ], @@ -6729,13 +6727,13 @@ "arithmetic" ], "total_cycles": [ - 8.666666984558105, + 7.666666507720947, 2.0, 2.0 ] }, "thread_occupancy": 100, - "uniform_registers_used": 2, + "uniform_registers_used": 1, "work_registers_used": 4 } } @@ -6754,7 +6752,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 58, + "fp16_arithmetic": 61, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6783,9 +6781,9 @@ "texture" ], "shortest_path_cycles": [ - 0.171875, - 0.171875, - 0.109375, + 0.140625, + 0.125, + 0.140625, 0.0625, 0.0, 
0.25, @@ -6796,9 +6794,9 @@ "texture" ], "total_cycles": [ - 0.359375, - 0.359375, - 0.234375, + 0.328125, + 0.328125, + 0.328125, 0.125, 0.0, 0.5, @@ -6838,7 +6836,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 3.299999952316284, + 2.309999942779541, 2.0, 1.0 ], @@ -6846,14 +6844,14 @@ "arithmetic" ], "total_cycles": [ - 5.333333492279053, + 5.0, 2.0, 2.0 ] }, "thread_occupancy": 100, - "uniform_registers_used": 2, - "work_registers_used": 4 + "uniform_registers_used": 1, + "work_registers_used": 3 } } } @@ -6871,7 +6869,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 61, + "fp16_arithmetic": 70, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -6897,12 +6895,13 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt" + "arith_cvt", + "arith_sfu" ], "shortest_path_cycles": [ - 0.078125, - 0.046875, - 0.078125, + 0.0625, + 0.03125, + 0.0625, 0.0625, 0.0, 0.0, @@ -6915,7 +6914,7 @@ "total_cycles": [ 0.3125, 0.234375, - 0.296875, + 0.28125, 0.3125, 0.0, 0.25, @@ -6955,7 +6954,7 @@ "arithmetic" ], "shortest_path_cycles": [ - 2.9700000286102295, + 1.649999976158142, 1.0, 0.0 ], @@ -6963,7 +6962,7 @@ "arithmetic" ], "total_cycles": [ - 6.666666507720947, + 5.0, 1.0, 1.0 ] @@ -6988,7 +6987,7 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 57, + "fp16_arithmetic": 66, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ @@ -7014,12 +7013,13 @@ ], "shortest_path_bound_pipelines": [ "arith_total", - "arith_cvt" + "arith_cvt", + "arith_sfu" ], "shortest_path_cycles": [ - 0.078125, - 0.046875, - 0.078125, + 0.0625, + 0.03125, + 0.0625, 0.0625, 0.0, 0.0, @@ -7032,7 +7032,7 @@ "total_cycles": [ 0.234375, 0.234375, - 0.203125, + 0.1875, 0.125, 0.0, 0.25, @@ -7042,7 +7042,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 8, - "work_registers_used": 20 + "work_registers_used": 19 } } }, @@ -7072,7 +7072,7 @@ 
"arithmetic" ], "shortest_path_cycles": [ - 2.309999942779541, + 1.649999976158142, 1.0, 0.0 ], @@ -7080,14 +7080,14 @@ "arithmetic" ], "total_cycles": [ - 4.333333492279053, + 3.6666667461395264, 1.0, 1.0 ] }, "thread_occupancy": 100, "uniform_registers_used": 1, - "work_registers_used": 4 + "work_registers_used": 2 } } } @@ -8915,17 +8915,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 33, + "fp16_arithmetic": 68, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "longest_path_cycles": [ - 1.5125000476837158, - 1.5125000476837158, - 0.546875, + 1.5, + 1.3875000476837158, + 0.737500011920929, 1.5, 0.0, 0.125, @@ -8955,12 +8955,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "total_cycles": [ - 1.6375000476837158, - 1.6375000476837158, - 0.578125, + 1.5625, + 1.5125000476837158, + 0.762499988079071, 1.5625, 0.0, 0.125, @@ -8969,7 +8969,7 @@ }, "stack_spill_bytes": 0, "thread_occupancy": 100, - "uniform_registers_used": 20, + "uniform_registers_used": 16, "work_registers_used": 32 } } @@ -8984,12 +8984,12 @@ "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ - null + "arithmetic" ], "longest_path_cycles": [ - null, - null, - null + 22.110000610351562, + 1.0, + 0.0 ], "pipelines": [ "arithmetic", @@ -9008,14 +9008,14 @@ "arithmetic" ], "total_cycles": [ - 10.666666984558105, + 10.0, 1.0, 0.0 ] }, "thread_occupancy": 100, - "uniform_registers_used": 1, - "work_registers_used": 4 + "uniform_registers_used": 2, + "work_registers_used": 3 } } } @@ -12265,17 +12265,17 @@ "uses_late_zs_update": false, "variants": { "Main": { - "fp16_arithmetic": 37, + "fp16_arithmetic": 65, "has_stack_spilling": false, "performance": { "longest_path_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "longest_path_cycles": [ - 1.5499999523162842, - 1.5499999523162842, - 0.515625, + 1.5, + 
1.4249999523162842, + 0.699999988079071, 1.5, 0.0, 0.125, @@ -12305,12 +12305,12 @@ ], "total_bound_pipelines": [ "arith_total", - "arith_fma" + "arith_sfu" ], "total_cycles": [ - 1.6749999523162842, - 1.6749999523162842, - 0.5625, + 1.5625, + 1.5499999523162842, + 0.75, 1.5625, 0.0, 0.125, @@ -12320,7 +12320,7 @@ "stack_spill_bytes": 0, "thread_occupancy": 100, "uniform_registers_used": 18, - "work_registers_used": 32 + "work_registers_used": 31 } } }