Move vector normalization to path_coarse.comp

Performing the normalization in path_coarse.comp reduces redundant computation, as each thread in a SIMD group computes for a different path. This gives a slight overall performance boost. The normal vector direction was flipped to simplify the logic.
linebender · Apr 11, 2021 · e8cb560 · e8cb560
1 parent 16177b4
commit e8cb560
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 27 deletions.
diff --git a/piet-gpu-types/src/tile.rs b/piet-gpu-types/src/tile.rs
@@ -15,6 +15,7 @@ piet_gpu! {
         struct TileSeg {
             origin: [f32; 2],
             vector: [f32; 2],
+            len: f32,
             y_edge: f32,
             next: Ref<TileSeg>,
         }

diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
@@ -185,21 +185,18 @@ void main() {
             }
             do {
                 TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
-                float len2 = dot(seg.vector, seg.vector);
-                if (len2 > 0.) {
+                if (seg.len > 0.) {
                     // Compute the stroke area with a rectangular implicit test.
-                    float rlen = inversesqrt(len2);
-                    float len = rlen * len2;
-                    vec2 u = seg.vector * rlen;
-                    vec2 n = vec2(-u.y, u.x);
+                    vec2 u = seg.vector;
+                    vec2 n = vec2(u.y, -u.x);
                     for (uint k = 0; k < CHUNK; k++) {
                         if (stencil[k] == ~0u) continue;
                         vec2 my_xy = xy + vec2(chunk_offset(k));
                         vec2 dpos = seg.origin - my_xy - vec2(0.5, 0.5);
                         float kp = dot(dpos, n);
                         uint par = getLut(n, kp + stroke.half_width) ^ getLut(n, kp - stroke.half_width);
                         float ko = dot(dpos, u);
-                        uint ortho = getLut(u, ko) ^ getLut(u, ko + len);
+                        uint ortho = getLut(u, ko) ^ getLut(u, ko + seg.len);
                         coverage[k] |= par & ortho;
                     }
                 }
@@ -214,16 +211,12 @@ void main() {
             do {
                 TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
                 vec2 o = vec2(0.5, 0.5);
-                vec2 n = normalize(vec2(-seg.vector.y, seg.vector.x));
-                // Make sure that the normal vector points the right side (we want the area to the right of the line).
-                // The code below flips n if n.x < 0.
-                uint signBit = floatBitsToUint(n.x) & 0x80000000u;
-                n = vec2(abs(n.x), uintBitsToFloat(floatBitsToUint(n.y) ^ signBit));
+                vec2 n = vec2(seg.vector.y, -seg.vector.x);
                 for (uint k = 0; k < CHUNK; k++) {
                     if (stencil[k] == ~0u) continue;
                     vec2 my_xy = xy + vec2(chunk_offset(k));
                     vec2 start = seg.origin - my_xy;
-                    vec2 end = start + seg.vector;
+                    vec2 end = start + seg.vector * seg.len;
                     // The horizontal ray test is calculated with lookup tables as an AND of three half-planes:
                     // Two of them are used to confine the range of y, and this is implemented using a logically equivalent
                     // method with XOR. While the left edge is already clipped to the tile boundary (see path_coarse.comp), there

diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
@@ -237,6 +237,17 @@ void main() {
                     xx0 = clamp(xx0, x0, x1);
                     xx1 = clamp(xx1, x0, x1);
 
+                    vec2 v = p1 - p0;
+                    float len2 = dot(v, v);
+                    float rlen = inversesqrt(len2);
+                    vec2 normalized = v * rlen;
+                    // Make sure that the normal vector points to the right side (we want the area to the right of the line).
+                    // The code below flips the vector and length if v.y < 0.
+                    // Note that the normal vector is rotated CW by a right angle.
+                    uint signBit = floatBitsToUint(normalized.y) & 0x80000000u;
+                    normalized = vec2(uintBitsToFloat(floatBitsToUint(normalized.x) ^ signBit), abs(normalized.y));
+                    float len = uintBitsToFloat(floatBitsToUint(rlen * len2) ^ signBit);
+
                     for (int x = xx0; x < xx1; x++) {
                         float tile_x0 = float(x * TILE_WIDTH_PX);
                         TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
@@ -246,22 +257,23 @@ void main() {
                             old = atomicExchange(memory[tile_el], tile_offset);
                         }
                         tile_seg.origin = p0;
-                        tile_seg.vector = p1 - p0;
+                        tile_seg.vector = normalized;
+                        tile_seg.len = len;
                         float y_edge = 0.0;
                         if (!is_stroke) {
-                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+                            float t = (tile_x0 - p0.x) / dx;
+                            y_edge = mix(p0.y, p1.y, t);
                             if (min(p0.x, p1.x) < tile_x0) {
-                                vec2 p = vec2(tile_x0, y_edge);
                                 if (p0.x > p1.x) {
-                                    tile_seg.vector = p - p0;
+                                    tile_seg.len *= t;
                                 } else {
-                                    tile_seg.origin = p;
-                                    tile_seg.vector = p1 - p;
+                                    tile_seg.origin = vec2(tile_x0, y_edge);
+                                    tile_seg.len *= (1 - t);
                                 }
-                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
+                                // kernel4 uses sign(len) for the sign of the intersection backdrop.
                                 // Nudge zeroes towards the intended sign.
-                                if (tile_seg.vector.x == 0) {
-                                    tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
+                                if (tile_seg.len == 0) {
+                                    tile_seg.len = sign(dy)*1e-9;
                                 }
                             }
                             if (x <= min_xray || max_xray < x) {

diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h
@@ -43,11 +43,12 @@ TileRef Tile_index(TileRef ref, uint index) {
 struct TileSeg {
     vec2 origin;
     vec2 vector;
+    float len;
     float y_edge;
     TileSegRef next;
 };
 
-#define TileSeg_size 24
+#define TileSeg_size 28
 
 TileSegRef TileSeg_index(TileSegRef ref, uint index) {
     return TileSegRef(ref.offset + index * TileSeg_size);
@@ -106,11 +107,13 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
     uint raw3 = read_mem(a, ix + 3);
     uint raw4 = read_mem(a, ix + 4);
     uint raw5 = read_mem(a, ix + 5);
+    uint raw6 = read_mem(a, ix + 6);
     TileSeg s;
     s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.y_edge = uintBitsToFloat(raw4);
-    s.next = TileSegRef(raw5);
+    s.len = uintBitsToFloat(raw4);
+    s.y_edge = uintBitsToFloat(raw5);
+    s.next = TileSegRef(raw6);
     return s;
 }
 
@@ -120,8 +123,9 @@ void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
     write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
     write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
     write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
-    write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
-    write_mem(a, ix + 5, s.next.offset);
+    write_mem(a, ix + 4, floatBitsToUint(s.len));
+    write_mem(a, ix + 5, floatBitsToUint(s.y_edge));
+    write_mem(a, ix + 6, s.next.offset);
 }
 
 TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) {