diff --git a/impeller/compiler/shader_lib/impeller/gaussian.glsl b/impeller/compiler/shader_lib/impeller/gaussian.glsl
index 62874bec96d9c..9dd104d6e4ce2 100644
--- a/impeller/compiler/shader_lib/impeller/gaussian.glsl
+++ b/impeller/compiler/shader_lib/impeller/gaussian.glsl
@@ -6,51 +6,53 @@
 #define GAUSSIAN_GLSL_
 
 #include <impeller/constants.glsl>
+#include <impeller/types.glsl>
 
 /// Gaussian distribution function.
-float IPGaussian(float x, float sigma) {
-  float variance = sigma * sigma;
-  return exp(-0.5 * x * x / variance) / (kSqrtTwoPi * sigma);
+float16_t IPGaussian(float16_t x, float16_t sigma) {
+  float16_t variance = sigma * sigma;
+  return exp(-0.5hf * x * x / variance) / (float16_t(kSqrtTwoPi) * sigma);
 }
 
 /// Abramowitz and Stegun erf approximation.
-float IPErf(float x) {
-  float a = abs(x);
+float16_t IPErf(float16_t x) {
+  float16_t a = abs(x);
   // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1
-  float b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0;
-  return sign(x) * (1 - 1 / (b * b * b * b));
+  float16_t b =
+      (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf;
+  return sign(x) * (1.0hf - 1.0hf / (b * b * b * b));
 }
 
 /// Vec2 variation for the Abramowitz and Stegun erf approximation.
-vec2 IPVec2Erf(vec2 x) {
-  vec2 a = abs(x);
+f16vec2 IPVec2Erf(f16vec2 x) {
+  f16vec2 a = abs(x);
   // 0.278393*x + 0.230389*x^2 + 0.078108*x^4 + 1
-  vec2 b = (0.278393 + (0.230389 + 0.078108 * a * a) * a) * a + 1.0;
-  return sign(x) * (1 - 1 / (b * b * b * b));
+  f16vec2 b = (0.278393hf + (0.230389hf + 0.078108hf * a * a) * a) * a + 1.0hf;
+  return sign(x) * (1.0hf - 1.0hf / (b * b * b * b));
 }
 
 /// The indefinite integral of the Gaussian function.
 /// Uses a very close approximation of Erf.
-float IPGaussianIntegral(float x, float sigma) {
+float16_t IPGaussianIntegral(float16_t x, float16_t sigma) {
   // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2
-  return (1 + IPErf(x * (kHalfSqrtTwo / sigma))) * 0.5;
+  return (1.0hf + IPErf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf;
 }
 
 /// Vec2 variation for the indefinite integral of the Gaussian function.
 /// Uses a very close approximation of Erf.
-vec2 IPVec2GaussianIntegral(vec2 x, float sigma) {
+f16vec2 IPVec2GaussianIntegral(f16vec2 x, float16_t sigma) {
   // ( 1 + erf( x * (sqrt(2) / (2 * sigma) ) ) / 2
-  return (1 + IPVec2Erf(x * (kHalfSqrtTwo / sigma))) * 0.5;
+  return (1.0hf + IPVec2Erf(x * (float16_t(kHalfSqrtTwo) / sigma))) * 0.5hf;
 }
 
 /// Simpler (but less accurate) approximation of the Gaussian integral.
-vec2 IPVec2FastGaussianIntegral(vec2 x, float sigma) {
-  return 1 / (1 + exp(-kSqrtThree / sigma * x));
+f16vec2 IPVec2FastGaussianIntegral(f16vec2 x, float16_t sigma) {
+  return 1.0hf / (1.0hf + exp(float16_t(-kSqrtThree) / sigma * x));
 }
 
 /// Simple logistic sigmoid with a domain of [-1, 1] and range of [0, 1].
-float IPSigmoid(float x) {
-  return 1.03731472073 / (1 + exp(-4 * x)) - 0.0186573603638;
+float16_t IPSigmoid(float16_t x) {
+  return 1.03731472073hf / (1.0hf + exp(-4.0hf * x)) - 0.0186573603638hf;
 }
 
 #endif
diff --git a/impeller/compiler/shader_lib/impeller/texture.glsl b/impeller/compiler/shader_lib/impeller/texture.glsl
index a952c0b9c640e..4b7ba3235d640 100644
--- a/impeller/compiler/shader_lib/impeller/texture.glsl
+++ b/impeller/compiler/shader_lib/impeller/texture.glsl
@@ -141,6 +141,15 @@ vec4 IPSampleDecal(sampler2D texture_sampler, vec2 coords) {
   return texture(texture_sampler, coords);
 }
 
+/// Decal-mode sample of an f16 texture; returns zero outside [0, 1) UVs.
+f16vec4 IPHalfSampleDecal(f16sampler2D texture_sampler, vec2 coords) {
+  if (any(lessThan(coords, vec2(0))) ||
+      any(greaterThanEqual(coords, vec2(1)))) {
+    return f16vec4(0.0);
+  }
+  return texture(texture_sampler, coords);
+}
+
 /// Sample a texture, emulating a specific tile mode.
 ///
 /// This is useful for Impeller graphics backend that don't have native support
diff --git a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
index fb5e6ad498fcf..3f0a1f467c9d4 100644
--- a/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
+++ b/impeller/entity/contents/filters/gaussian_blur_filter_contents.cc
@@ -198,10 +198,9 @@ std::optional<Entity> DirectionalGaussianBlurFilterContents::RenderFilter(
     frag_info.blur_radius = r.radius;
 
     // The blur direction is in input UV space.
-    frag_info.blur_direction =
-        pass_transform.Invert().TransformDirection(Vector2(1, 0)).Normalize();
-
-    frag_info.texture_size = Point(input_snapshot->GetCoverage().value().size);
+    frag_info.blur_uv_offset =
+        pass_transform.Invert().TransformDirection(Vector2(1, 0)).Normalize() /
+        Point(input_snapshot->GetCoverage().value().size);
 
     Command cmd;
     cmd.label = SPrintF("Gaussian Blur Filter (Radius=%.2f)",
diff --git a/impeller/entity/shaders/border_mask_blur.frag b/impeller/entity/shaders/border_mask_blur.frag
index b28dfc8210380..88abc3fc64d08 100644
--- a/impeller/entity/shaders/border_mask_blur.frag
+++ b/impeller/entity/shaders/border_mask_blur.frag
@@ -15,42 +15,42 @@
 // integral (using an erf approximation) to the 4 edges of the UV rectangle and
 // multiplying them.
 
-uniform sampler2D texture_sampler;
+uniform f16sampler2D texture_sampler;
 
 uniform FragInfo {
-  float src_factor;
-  float inner_blur_factor;
-  float outer_blur_factor;
+  float16_t src_factor;
+  float16_t inner_blur_factor;
+  float16_t outer_blur_factor;
 
-  vec2 sigma_uv;
+  f16vec2 sigma_uv;
 }
 frag_info;
 
 in vec2 v_texture_coords;
 
-out vec4 frag_color;
+out f16vec4 frag_color;
 
-float BoxBlurMask(vec2 uv) {
+float16_t BoxBlurMask(f16vec2 uv) {
   // LTRB
-  return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) *      //
-         IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) *      //
-         IPGaussianIntegral(1 - uv.x, frag_info.sigma_uv.x) *  //
-         IPGaussianIntegral(1 - uv.y, frag_info.sigma_uv.y);
+  return IPGaussianIntegral(uv.x, frag_info.sigma_uv.x) *          //
+         IPGaussianIntegral(uv.y, frag_info.sigma_uv.y) *          //
+         IPGaussianIntegral(1.0hf - uv.x, frag_info.sigma_uv.x) *  //
+         IPGaussianIntegral(1.0hf - uv.y, frag_info.sigma_uv.y);
 }
 
 void main() {
-  vec4 image_color = texture(texture_sampler, v_texture_coords);
-  float blur_factor = BoxBlurMask(v_texture_coords);
+  f16vec4 image_color = texture(texture_sampler, v_texture_coords);
+  float16_t blur_factor = BoxBlurMask(f16vec2(v_texture_coords));
 
-  float within_bounds =
-      float(v_texture_coords.x >= 0 && v_texture_coords.y >= 0 &&
-            v_texture_coords.x < 1 && v_texture_coords.y < 1);
-  float inner_factor =
+  float16_t within_bounds =
+      float16_t(v_texture_coords.x >= 0.0 && v_texture_coords.y >= 0.0 &&
+                v_texture_coords.x < 1.0 && v_texture_coords.y < 1.0);
+  float16_t inner_factor =
       (frag_info.inner_blur_factor * blur_factor + frag_info.src_factor) *
       within_bounds;
-  float outer_factor =
-      frag_info.outer_blur_factor * blur_factor * (1 - within_bounds);
+  float16_t outer_factor =
+      frag_info.outer_blur_factor * blur_factor * (1.0hf - within_bounds);
 
-  float mask_factor = inner_factor + outer_factor;
+  float16_t mask_factor = inner_factor + outer_factor;
   frag_color = image_color * mask_factor;
 }
diff --git a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl
index 4a218303efc27..9df092e1299e4 100644
--- a/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl
+++ b/impeller/entity/shaders/gaussian_blur/gaussian_blur.glsl
@@ -18,35 +18,34 @@
 #include <impeller/texture.glsl>
 #include <impeller/types.glsl>
 
-uniform sampler2D texture_sampler;
+uniform f16sampler2D texture_sampler;
 
 uniform BlurInfo {
-  vec2 texture_size;
-  vec2 blur_direction;
+  f16vec2 blur_uv_offset;  // UV offset between adjacent blur samples.
 
   // The blur sigma and radius have a linear relationship which is defined
   // host-side, but both are useful controls here. Sigma (pixels per standard
   // deviation) is used to define the gaussian function itself, whereas the
   // radius is used to limit how much of the function is integrated.
-  float blur_sigma;
-  float blur_radius;
+  float16_t blur_sigma;
+  float16_t blur_radius;
 }
 blur_info;
 
 #if ENABLE_ALPHA_MASK
-uniform sampler2D alpha_mask_sampler;
+uniform f16sampler2D alpha_mask_sampler;
 
 uniform MaskInfo {
-  float src_factor;
-  float inner_blur_factor;
-  float outer_blur_factor;
+  float16_t src_factor;
+  float16_t inner_blur_factor;
+  float16_t outer_blur_factor;
 }
 mask_info;
 #endif
 
-vec4 Sample(sampler2D tex, vec2 coords) {
+f16vec4 Sample(f16sampler2D tex, vec2 coords) {
 #if ENABLE_DECAL_SPECIALIZATION
-  return IPSampleDecal(tex, coords);
+  return IPHalfSampleDecal(tex, coords);
 #else
   return texture(tex, coords);
 #endif
@@ -55,31 +54,35 @@ vec4 Sample(sampler2D tex, vec2 coords) {
 in vec2 v_texture_coords;
 in vec2 v_src_texture_coords;
 
-out vec4 frag_color;
+out f16vec4 frag_color;
 
 void main() {
-  vec4 total_color = vec4(0);
-  float gaussian_integral = 0;
-  vec2 blur_uv_offset = blur_info.blur_direction / blur_info.texture_size;
+  f16vec4 total_color = f16vec4(0.0hf);
+  float16_t gaussian_integral = 0.0hf;
 
-  for (float i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) {
-    float gaussian = IPGaussian(i, blur_info.blur_sigma);
+  for (float16_t i = -blur_info.blur_radius; i <= blur_info.blur_radius; i++) {
+    float16_t gaussian = IPGaussian(i, blur_info.blur_sigma);
     gaussian_integral += gaussian;
     total_color +=
-        gaussian *
-        Sample(texture_sampler,                       // sampler
-               v_texture_coords + blur_uv_offset * i  // texture coordinates
-        );
+        gaussian * Sample(texture_sampler,  // sampler
+                          v_texture_coords + blur_info.blur_uv_offset *
+                                                 i  // texture coordinates
+                   );
   }
 
   frag_color = total_color / gaussian_integral;
 
 #if ENABLE_ALPHA_MASK
-  vec4 src_color = Sample(alpha_mask_sampler,   // sampler
-                          v_src_texture_coords  // texture coordinates
+  f16vec4 src_color = Sample(alpha_mask_sampler,   // sampler
+                             v_src_texture_coords  // texture coordinates
   );
-  float blur_factor = mask_info.inner_blur_factor * float(src_color.a > 0) +
-                      mask_info.outer_blur_factor * float(src_color.a == 0);
+
+  float16_t blur_factor = 0.0hf;  // Used when alpha is negative or NaN.
+  if (src_color.a > 0.0hf) {
+    blur_factor = mask_info.inner_blur_factor;
+  } else if (src_color.a == 0.0hf) {
+    blur_factor = mask_info.outer_blur_factor;
+  }
 
   frag_color = frag_color * blur_factor + src_color * mask_info.src_factor;
 #endif
diff --git a/impeller/entity/shaders/rrect_blur.frag b/impeller/entity/shaders/rrect_blur.frag
index 5b0ddff80976c..0369ef7e93bbf 100644
--- a/impeller/entity/shaders/rrect_blur.frag
+++ b/impeller/entity/shaders/rrect_blur.frag
@@ -6,58 +6,61 @@
 #include <impeller/types.glsl>
 
 uniform FragInfo {
-  vec4 color;
-  float blur_sigma;
-  vec2 rect_size;
-  float corner_radius;
+  f16vec4 color;
+  f16vec2 rect_size;
+  float16_t blur_sigma;
+  float16_t corner_radius;
 }
 frag_info;
 
 in vec2 v_position;
 
-out vec4 frag_color;
+out f16vec4 frag_color;
 
 const int kSampleCount = 4;
 
-float RRectDistance(vec2 sample_position, vec2 half_size) {
-  vec2 space = abs(sample_position) - half_size + frag_info.corner_radius;
-  return length(max(space, 0.0)) + min(max(space.x, space.y), 0.0) -
-         frag_info.corner_radius;
+float16_t RRectDistance(f16vec2 sample_position, f16vec2 half_size) {
+  f16vec2 space = abs(sample_position) - half_size + frag_info.corner_radius;
+  return length(max(space, float16_t(0.0hf))) +
+         min(max(space.x, space.y), float16_t(0.0hf)) - frag_info.corner_radius;
 }
 
 /// Closed form unidirectional rounded rect blur mask solution using the
 /// analytical Gaussian integral (with approximated erf).
-float RRectShadowX(vec2 sample_position, vec2 half_size) {
+float16_t RRectShadowX(f16vec2 sample_position, f16vec2 half_size) {
   // Compute the X direction distance field (not incorporating the Y distance)
   // for the rounded rect.
-  float space =
-      min(0, half_size.y - frag_info.corner_radius - abs(sample_position.y));
-  float rrect_distance =
+  float16_t space =
+      min(float16_t(0.0hf),
+          half_size.y - frag_info.corner_radius - abs(sample_position.y));
+  float16_t rrect_distance =
       half_size.x - frag_info.corner_radius +
-      sqrt(max(0, frag_info.corner_radius * frag_info.corner_radius -
-                      space * space));
+      sqrt(max(
+          float16_t(0.0hf),
+          frag_info.corner_radius * frag_info.corner_radius - space * space));
 
   // Map the linear distance field to the approximate Gaussian integral.
-  vec2 integral = IPVec2FastGaussianIntegral(
-      sample_position.x + vec2(-rrect_distance, rrect_distance),
+  f16vec2 integral = IPVec2FastGaussianIntegral(
+      sample_position.x + f16vec2(-rrect_distance, rrect_distance),
       frag_info.blur_sigma);
   return integral.y - integral.x;
 }
 
-float RRectShadow(vec2 sample_position, vec2 half_size) {
+float16_t RRectShadow(f16vec2 sample_position, f16vec2 half_size) {
   // Limit the sampling range to 3 standard deviations in the Y direction from
   // the kernel center to incorporate 99.7% of the color contribution.
-  float half_sampling_range = frag_info.blur_sigma * 3;
+  float16_t half_sampling_range = frag_info.blur_sigma * 3.0hf;
 
-  float begin_y = max(-half_sampling_range, sample_position.y - half_size.y);
-  float end_y = min(half_sampling_range, sample_position.y + half_size.y);
-  float interval = (end_y - begin_y) / kSampleCount;
+  float16_t begin_y =
+      max(-half_sampling_range, sample_position.y - half_size.y);
+  float16_t end_y = min(half_sampling_range, sample_position.y + half_size.y);
+  float16_t interval = (end_y - begin_y) / float16_t(kSampleCount);
 
   // Sample the X blur kSampleCount times, weighted by the Gaussian function.
-  float result = 0;
+  float16_t result = 0.0hf;
   for (int sample_i = 0; sample_i < kSampleCount; sample_i++) {
-    float y = begin_y + interval * (sample_i + 0.5);
-    result += RRectShadowX(vec2(sample_position.x, sample_position.y - y),
+    float16_t y = begin_y + interval * (float16_t(sample_i) + 0.5hf);
+    result += RRectShadowX(f16vec2(sample_position.x, sample_position.y - y),
                            half_size) *
               IPGaussian(y, frag_info.blur_sigma) * interval;
   }
@@ -68,10 +71,10 @@ float RRectShadow(vec2 sample_position, vec2 half_size) {
 void main() {
   frag_color = frag_info.color;
 
-  vec2 half_size = frag_info.rect_size * 0.5;
-  vec2 sample_position = v_position - half_size;
+  f16vec2 half_size = frag_info.rect_size * 0.5hf;
+  f16vec2 sample_position = f16vec2(v_position) - half_size;
 
-  if (frag_info.blur_sigma > 0) {
+  if (frag_info.blur_sigma > 0.0hf) {
     frag_color *= RRectShadow(sample_position, half_size);
   } else {
     frag_color *= -RRectDistance(sample_position, half_size);
diff --git a/impeller/tools/malioc.json b/impeller/tools/malioc.json
index 6c9fc4d8546d8..da75088513d82 100644
--- a/impeller/tools/malioc.json
+++ b/impeller/tools/malioc.json
@@ -1440,7 +1440,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 5,
+          "fp16_arithmetic": 44,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -1448,8 +1448,8 @@
               "arith_fma"
             ],
             "longest_path_cycles": [
-              0.8125,
-              0.8125,
+              0.875,
+              0.875,
               0.203125,
               0.25,
               0.0,
@@ -1470,8 +1470,8 @@
               "arith_fma"
             ],
             "shortest_path_cycles": [
-              0.8125,
-              0.8125,
+              0.875,
+              0.875,
               0.203125,
               0.25,
               0.0,
@@ -1483,8 +1483,8 @@
               "arith_fma"
             ],
             "total_cycles": [
-              0.8125,
-              0.8125,
+              0.875,
+              0.875,
               0.203125,
               0.25,
               0.0,
@@ -1495,7 +1495,7 @@
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
           "uniform_registers_used": 12,
-          "work_registers_used": 22
+          "work_registers_used": 18
         }
       }
     }
@@ -3276,7 +3276,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 55,
+          "fp16_arithmetic": 53,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -3305,9 +3305,9 @@
               "arith_cvt"
             ],
             "shortest_path_cycles": [
-              0.3125,
-              0.09375,
-              0.3125,
+              0.328125,
+              0.078125,
+              0.328125,
               0.25,
               0.0,
               0.25,
@@ -3318,9 +3318,9 @@
               "arith_cvt"
             ],
             "total_cycles": [
-              0.515625,
-              0.265625,
-              0.515625,
+              0.578125,
+              0.25,
+              0.578125,
               0.5,
               0.0,
               0.5,
@@ -3348,7 +3348,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 47,
+          "fp16_arithmetic": 45,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -3377,9 +3377,9 @@
               "texture"
             ],
             "shortest_path_cycles": [
-              0.15625,
-              0.09375,
-              0.15625,
+              0.1875,
+              0.078125,
+              0.1875,
               0.0625,
               0.0,
               0.25,
@@ -3390,9 +3390,9 @@
               "texture"
             ],
             "total_cycles": [
-              0.265625,
-              0.265625,
+              0.34375,
               0.25,
+              0.34375,
               0.125,
               0.0,
               0.5,
@@ -3402,7 +3402,7 @@
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
           "uniform_registers_used": 14,
-          "work_registers_used": 15
+          "work_registers_used": 14
         }
       }
     }
@@ -5803,7 +5803,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 10,
+          "fp16_arithmetic": 86,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -5811,9 +5811,9 @@
               "arith_fma"
             ],
             "longest_path_cycles": [
-              0.8125,
-              0.8125,
-              0.234375,
+              0.90625,
+              0.90625,
+              0.265625,
               0.25,
               0.0,
               0.25,
@@ -5833,9 +5833,9 @@
               "arith_fma"
             ],
             "shortest_path_cycles": [
-              0.8125,
-              0.8125,
-              0.203125,
+              0.90625,
+              0.90625,
+              0.234375,
               0.25,
               0.0,
               0.25,
@@ -5846,9 +5846,9 @@
               "arith_fma"
             ],
             "total_cycles": [
-              0.8125,
-              0.8125,
-              0.234375,
+              0.90625,
+              0.90625,
+              0.265625,
               0.25,
               0.0,
               0.25,
@@ -5857,8 +5857,8 @@
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 10,
-          "work_registers_used": 32
+          "uniform_registers_used": 12,
+          "work_registers_used": 29
         }
       }
     },
@@ -5903,7 +5903,7 @@
           },
           "thread_occupancy": 100,
           "uniform_registers_used": 1,
-          "work_registers_used": 3
+          "work_registers_used": 2
         }
       }
     }
@@ -6633,7 +6633,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 63,
+          "fp16_arithmetic": 66,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -6665,7 +6665,7 @@
             ],
             "shortest_path_cycles": [
               0.25,
-              0.171875,
+              0.125,
               0.25,
               0.25,
               0.0,
@@ -6674,14 +6674,12 @@
             ],
             "total_bound_pipelines": [
               "arith_total",
-              "arith_sfu",
-              "varying",
-              "texture"
+              "arith_cvt"
             ],
             "total_cycles": [
-              0.5,
-              0.359375,
-              0.484375,
+              0.53125,
+              0.328125,
+              0.53125,
               0.5,
               0.0,
               0.5,
@@ -6690,7 +6688,7 @@
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 10,
+          "uniform_registers_used": 12,
           "work_registers_used": 21
         }
       }
@@ -6721,7 +6719,7 @@
               "arithmetic"
             ],
             "shortest_path_cycles": [
-              4.619999885559082,
+              3.299999952316284,
               2.0,
               0.0
             ],
@@ -6729,13 +6727,13 @@
               "arithmetic"
             ],
             "total_cycles": [
-              8.666666984558105,
+              7.666666507720947,
               2.0,
               2.0
             ]
           },
           "thread_occupancy": 100,
-          "uniform_registers_used": 2,
+          "uniform_registers_used": 1,
           "work_registers_used": 4
         }
       }
@@ -6754,7 +6752,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 58,
+          "fp16_arithmetic": 61,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -6783,9 +6781,9 @@
               "texture"
             ],
             "shortest_path_cycles": [
-              0.171875,
-              0.171875,
-              0.109375,
+              0.140625,
+              0.125,
+              0.140625,
               0.0625,
               0.0,
               0.25,
@@ -6796,9 +6794,9 @@
               "texture"
             ],
             "total_cycles": [
-              0.359375,
-              0.359375,
-              0.234375,
+              0.328125,
+              0.328125,
+              0.328125,
               0.125,
               0.0,
               0.5,
@@ -6838,7 +6836,7 @@
               "arithmetic"
             ],
             "shortest_path_cycles": [
-              3.299999952316284,
+              2.309999942779541,
               2.0,
               1.0
             ],
@@ -6846,14 +6844,14 @@
               "arithmetic"
             ],
             "total_cycles": [
-              5.333333492279053,
+              5.0,
               2.0,
               2.0
             ]
           },
           "thread_occupancy": 100,
-          "uniform_registers_used": 2,
-          "work_registers_used": 4
+          "uniform_registers_used": 1,
+          "work_registers_used": 3
         }
       }
     }
@@ -6871,7 +6869,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 61,
+          "fp16_arithmetic": 70,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -6897,12 +6895,13 @@
             ],
             "shortest_path_bound_pipelines": [
               "arith_total",
-              "arith_cvt"
+              "arith_cvt",
+              "arith_sfu"
             ],
             "shortest_path_cycles": [
-              0.078125,
-              0.046875,
-              0.078125,
+              0.0625,
+              0.03125,
+              0.0625,
               0.0625,
               0.0,
               0.0,
@@ -6915,7 +6914,7 @@
             "total_cycles": [
               0.3125,
               0.234375,
-              0.296875,
+              0.28125,
               0.3125,
               0.0,
               0.25,
@@ -6955,7 +6954,7 @@
               "arithmetic"
             ],
             "shortest_path_cycles": [
-              2.9700000286102295,
+              1.649999976158142,
               1.0,
               0.0
             ],
@@ -6963,7 +6962,7 @@
               "arithmetic"
             ],
             "total_cycles": [
-              6.666666507720947,
+              5.0,
               1.0,
               1.0
             ]
@@ -6988,7 +6987,7 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 57,
+          "fp16_arithmetic": 66,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
@@ -7014,12 +7013,13 @@
             ],
             "shortest_path_bound_pipelines": [
               "arith_total",
-              "arith_cvt"
+              "arith_cvt",
+              "arith_sfu"
             ],
             "shortest_path_cycles": [
-              0.078125,
-              0.046875,
-              0.078125,
+              0.0625,
+              0.03125,
+              0.0625,
               0.0625,
               0.0,
               0.0,
@@ -7032,7 +7032,7 @@
             "total_cycles": [
               0.234375,
               0.234375,
-              0.203125,
+              0.1875,
               0.125,
               0.0,
               0.25,
@@ -7042,7 +7042,7 @@
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
           "uniform_registers_used": 8,
-          "work_registers_used": 20
+          "work_registers_used": 19
         }
       }
     },
@@ -7072,7 +7072,7 @@
               "arithmetic"
             ],
             "shortest_path_cycles": [
-              2.309999942779541,
+              1.649999976158142,
               1.0,
               0.0
             ],
@@ -7080,14 +7080,14 @@
               "arithmetic"
             ],
             "total_cycles": [
-              4.333333492279053,
+              3.6666667461395264,
               1.0,
               1.0
             ]
           },
           "thread_occupancy": 100,
           "uniform_registers_used": 1,
-          "work_registers_used": 4
+          "work_registers_used": 2
         }
       }
     }
@@ -8915,17 +8915,17 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 33,
+          "fp16_arithmetic": 68,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "arith_total",
-              "arith_fma"
+              "arith_sfu"
             ],
             "longest_path_cycles": [
-              1.5125000476837158,
-              1.5125000476837158,
-              0.546875,
+              1.5,
+              1.3875000476837158,
+              0.737500011920929,
               1.5,
               0.0,
               0.125,
@@ -8955,12 +8955,12 @@
             ],
             "total_bound_pipelines": [
               "arith_total",
-              "arith_fma"
+              "arith_sfu"
             ],
             "total_cycles": [
-              1.6375000476837158,
-              1.6375000476837158,
-              0.578125,
+              1.5625,
+              1.5125000476837158,
+              0.762499988079071,
               1.5625,
               0.0,
               0.125,
@@ -8969,7 +8969,7 @@
           },
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
-          "uniform_registers_used": 20,
+          "uniform_registers_used": 16,
           "work_registers_used": 32
         }
       }
@@ -8984,12 +8984,12 @@
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
-              null
+              "arithmetic"
             ],
             "longest_path_cycles": [
-              null,
-              null,
-              null
+              22.110000610351562,
+              1.0,
+              0.0
             ],
             "pipelines": [
               "arithmetic",
@@ -9008,14 +9008,14 @@
               "arithmetic"
             ],
             "total_cycles": [
-              10.666666984558105,
+              10.0,
               1.0,
               0.0
             ]
           },
           "thread_occupancy": 100,
-          "uniform_registers_used": 1,
-          "work_registers_used": 4
+          "uniform_registers_used": 2,
+          "work_registers_used": 3
         }
       }
     }
@@ -12265,17 +12265,17 @@
       "uses_late_zs_update": false,
       "variants": {
         "Main": {
-          "fp16_arithmetic": 37,
+          "fp16_arithmetic": 65,
           "has_stack_spilling": false,
           "performance": {
             "longest_path_bound_pipelines": [
               "arith_total",
-              "arith_fma"
+              "arith_sfu"
             ],
             "longest_path_cycles": [
-              1.5499999523162842,
-              1.5499999523162842,
-              0.515625,
+              1.5,
+              1.4249999523162842,
+              0.699999988079071,
               1.5,
               0.0,
               0.125,
@@ -12305,12 +12305,12 @@
             ],
             "total_bound_pipelines": [
               "arith_total",
-              "arith_fma"
+              "arith_sfu"
             ],
             "total_cycles": [
-              1.6749999523162842,
-              1.6749999523162842,
-              0.5625,
+              1.5625,
+              1.5499999523162842,
+              0.75,
               1.5625,
               0.0,
               0.125,
@@ -12320,7 +12320,7 @@
           "stack_spill_bytes": 0,
           "thread_occupancy": 100,
           "uniform_registers_used": 18,
-          "work_registers_used": 32
+          "work_registers_used": 31
         }
       }
     }