Skip to content

Commit

Permalink
Move vector normalization to path_coarse.comp
Browse files Browse the repository at this point in the history
Performing the normalization in path_coarse.comp reduces redundant
computation as each thread in SIMD computes for a different path.
Gives a slight overall performance boost.

The normal vector direction was flipped to simplify logic.
  • Loading branch information
ishitatsuyuki committed Apr 11, 2021
1 parent 16177b4 commit e8cb560
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 27 deletions.
1 change: 1 addition & 0 deletions piet-gpu-types/src/tile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ piet_gpu! {
// One path segment as stored for a tile. The shader-side mirror of this
// layout (tile.h) serializes the fields in this order as seven 32-bit words.
struct TileSeg {
// Start point of the segment (path_coarse.comp writes p0, or the point where
// the segment crosses the tile's left edge).
origin: [f32; 2],
// Direction of the segment; path_coarse.comp stores the normalized vector,
// sign-flipped so that its y component is non-negative.
vector: [f32; 2],
// Signed length along `vector`: the end point is origin + vector * len.
// The sign carries the original orientation of the segment.
len: f32,
// y coordinate where the segment meets the tile's left edge (fills only;
// path_coarse.comp initializes it to 0.0 otherwise).
y_edge: f32,
// Reference to another TileSeg; presumably the next segment in the tile's
// list -- confirm against the consumer loop in kernel4.comp.
next: Ref<TileSeg>,
}
Expand Down
19 changes: 6 additions & 13 deletions piet-gpu/shader/kernel4.comp
Original file line number Diff line number Diff line change
Expand Up @@ -185,21 +185,18 @@ void main() {
}
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
float len2 = dot(seg.vector, seg.vector);
if (len2 > 0.) {
if (seg.len > 0.) {
// Compute the stroke area with a rectangular implicit test.
float rlen = inversesqrt(len2);
float len = rlen * len2;
vec2 u = seg.vector * rlen;
vec2 n = vec2(-u.y, u.x);
vec2 u = seg.vector;
vec2 n = vec2(u.y, -u.x);
for (uint k = 0; k < CHUNK; k++) {
if (stencil[k] == ~0u) continue;
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 dpos = seg.origin - my_xy - vec2(0.5, 0.5);
float kp = dot(dpos, n);
uint par = getLut(n, kp + stroke.half_width) ^ getLut(n, kp - stroke.half_width);
float ko = dot(dpos, u);
uint ortho = getLut(u, ko) ^ getLut(u, ko + len);
uint ortho = getLut(u, ko) ^ getLut(u, ko + seg.len);
coverage[k] |= par & ortho;
}
}
Expand All @@ -214,16 +211,12 @@ void main() {
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
vec2 o = vec2(0.5, 0.5);
vec2 n = normalize(vec2(-seg.vector.y, seg.vector.x));
// Make sure that the normal vector points to the right side (we want the area to the right of the line).
// The code below flips n if n.x < 0.
uint signBit = floatBitsToUint(n.x) & 0x80000000u;
n = vec2(abs(n.x), uintBitsToFloat(floatBitsToUint(n.y) ^ signBit));
vec2 n = vec2(seg.vector.y, -seg.vector.x);
for (uint k = 0; k < CHUNK; k++) {
if (stencil[k] == ~0u) continue;
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
vec2 end = start + seg.vector;
vec2 end = start + seg.vector * seg.len;
// The horizontal ray test is calculated with lookup tables as an AND of three half-planes:
// Two of them are used to confine the range of y, and this is implemented using a logically equivalent
// method with XOR. While the left edge is already clipped to tile boundary (see path_coarse.comp), there
Expand Down
Binary file modified piet-gpu/shader/kernel4.spv
Binary file not shown.
30 changes: 21 additions & 9 deletions piet-gpu/shader/path_coarse.comp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,17 @@ void main() {
xx0 = clamp(xx0, x0, x1);
xx1 = clamp(xx1, x0, x1);

vec2 v = p1 - p0;
float len2 = dot(v, v);
float rlen = inversesqrt(len2);
vec2 normalized = v * rlen;
// Make sure that the normal vector points to the right side (we want the area to the right of the line).
// The code below flips the vector and length if v.y < 0.
// Note that the normal vector is rotated CW by a right angle.
uint signBit = floatBitsToUint(normalized.y) & 0x80000000u;
normalized = vec2(uintBitsToFloat(floatBitsToUint(normalized.x) ^ signBit), abs(normalized.y));
float len = uintBitsToFloat(floatBitsToUint(rlen * len2) ^ signBit);

for (int x = xx0; x < xx1; x++) {
float tile_x0 = float(x * TILE_WIDTH_PX);
TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
Expand All @@ -246,22 +257,23 @@ void main() {
old = atomicExchange(memory[tile_el], tile_offset);
}
tile_seg.origin = p0;
tile_seg.vector = p1 - p0;
tile_seg.vector = normalized;
tile_seg.len = len;
float y_edge = 0.0;
if (!is_stroke) {
y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
float t = (tile_x0 - p0.x) / dx;
y_edge = mix(p0.y, p1.y, t);
if (min(p0.x, p1.x) < tile_x0) {
vec2 p = vec2(tile_x0, y_edge);
if (p0.x > p1.x) {
tile_seg.vector = p - p0;
tile_seg.len *= t;
} else {
tile_seg.origin = p;
tile_seg.vector = p1 - p;
tile_seg.origin = vec2(tile_x0, y_edge);
tile_seg.len *= (1 - t);
}
// kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
// kernel4 uses sign(len) for the sign of the intersection backdrop.
// Nudge zeroes towards the intended sign.
if (tile_seg.vector.x == 0) {
tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
if (tile_seg.len == 0) {
tile_seg.len = sign(dy)*1e-9;
}
}
if (x <= min_xray || max_xray < x) {
Expand Down
Binary file modified piet-gpu/shader/path_coarse.spv
Binary file not shown.
14 changes: 9 additions & 5 deletions piet-gpu/shader/tile.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,12 @@ TileRef Tile_index(TileRef ref, uint index) {
// One path segment as stored for a tile. TileSeg_read/TileSeg_write
// serialize the fields in declaration order as seven consecutive 32-bit
// words (origin.x, origin.y, vector.x, vector.y, len, y_edge, next.offset).
struct TileSeg {
// Start point of the segment (path_coarse.comp writes p0, or the point where
// the segment crosses the tile's left edge).
vec2 origin;
// Direction of the segment; path_coarse.comp stores the normalized vector,
// sign-flipped so that its y component is non-negative.
vec2 vector;
// Signed length along `vector`: the end point is origin + vector * len.
// The sign carries the original orientation of the segment.
float len;
// y coordinate where the segment meets the tile's left edge (fills only;
// path_coarse.comp initializes it to 0.0 otherwise).
float y_edge;
// Reference to another TileSeg; presumably the next segment in the tile's
// list -- confirm against the consumer loop in kernel4.comp.
TileSegRef next;
};

#define TileSeg_size 24
#define TileSeg_size 28

TileSegRef TileSeg_index(TileSegRef ref, uint index) {
return TileSegRef(ref.offset + index * TileSeg_size);
Expand Down Expand Up @@ -106,11 +107,13 @@ TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
uint raw3 = read_mem(a, ix + 3);
uint raw4 = read_mem(a, ix + 4);
uint raw5 = read_mem(a, ix + 5);
uint raw6 = read_mem(a, ix + 6);
TileSeg s;
s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.y_edge = uintBitsToFloat(raw4);
s.next = TileSegRef(raw5);
s.len = uintBitsToFloat(raw4);
s.y_edge = uintBitsToFloat(raw5);
s.next = TileSegRef(raw6);
return s;
}

Expand All @@ -120,8 +123,9 @@ void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
write_mem(a, ix + 5, s.next.offset);
write_mem(a, ix + 4, floatBitsToUint(s.len));
write_mem(a, ix + 5, floatBitsToUint(s.y_edge));
write_mem(a, ix + 6, s.next.offset);
}

TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) {
Expand Down

0 comments on commit e8cb560

Please sign in to comment.