Handle transparent objects

linebender · Mar 14, 2021 · b908a32 · b908a32
1 parent 70c24af
commit b908a32
Show file tree

Hide file tree

Showing 9 changed files with 111 additions and 33 deletions.
diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
@@ -30,6 +30,8 @@ piet_gpu! {
             EndClip,
             Stroke(CmdStroke),
             Jump(CmdJump),
+            SaveStencil,
+            RestoreStencil,
         }
     }
 }
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
@@ -276,13 +276,6 @@ fn main() -> Result<(), Error> {
         println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
         println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
 
-        /*
-        let mut data: Vec<u32> = Default::default();
-        renderer.tile_buf.read(&mut data).unwrap();
-        piet_gpu::dump_k1_data(&data);
-        trace_ptcl(&data);
-        */
-
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
         // (probably passing a slice into a closure). But for now: keep it simple.

diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
@@ -1,15 +1,17 @@
-use piet_gpu_hal::hub;
-use piet_gpu_hal::vulkan::VkInstance;
-use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
-
-use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
+use std::thread::sleep;
+use std::time::Duration;
 
 use winit::{
     event::{Event, WindowEvent},
     event_loop::{ControlFlow, EventLoop},
     window::WindowBuilder,
 };
 
+use piet_gpu::{HEIGHT, PietGpuRenderContext, render_scene, Renderer, WIDTH};
+use piet_gpu_hal::{CmdBuf, Error, ImageLayout};
+use piet_gpu_hal::hub;
+use piet_gpu_hal::vulkan::VkInstance;
+
 const NUM_FRAMES: usize = 2;
 
 fn main() -> Result<(), Error> {
@@ -118,6 +120,8 @@ fn main() -> Result<(), Error> {
                         .present(image_idx, &[present_semaphores[frame_idx]])
                         .unwrap();
 
+                    sleep(Duration::from_millis(5));
+
                     current_frame += 1;
                 }
                 _ => (),

diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
@@ -87,6 +87,22 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
     return true;
 }
 
+bool alloc_cmd_rev(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { // Makes room in a command list that callers fill from high offsets to low; returns false only when malloc fails.
+    if (cmd_ref.offset >= cmd_limit) { // At or above the limit: succeed without allocating.
+        return true;
+    }
+    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
+    if (new_cmd.failed) {
+        return false;
+    }
+    CmdJump jump = CmdJump(cmd_ref.offset); // Jump target is the current cmd_ref, captured before it is repointed below.
+    cmd_alloc = new_cmd.alloc;
+    cmd_ref = CmdRef(cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size); // Last Cmd-sized slot of the new allocation.
+    Cmd_Jump_write(cmd_alloc, cmd_ref, jump); // The jump back into the previous allocation occupies that last slot.
+    cmd_limit = cmd_alloc.offset + Cmd_size; // New limit is one Cmd_size above the start of the new allocation.
+    return true;
+}
+
 void main() {
     if (mem_error != NO_ERROR) {
         return;
@@ -108,9 +124,15 @@ void main() {
     uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
     uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
     uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
-    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC * 2, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
     uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    Alloc alpha_cmd_alloc = slice_mem(conf.ptcl_alloc, PTCL_INITIAL_ALLOC * (this_tile_ix * 2 + 1), PTCL_INITIAL_ALLOC); // Second half of this tile's doubled slot; cmd_alloc above takes the first half, so the two lists never overlap.
+    CmdRef alpha_cmd_ref = CmdRef(alpha_cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size); // The alpha list grows downwards, so start at the last Cmd-sized slot.
+    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
+        Cmd_End_write(alpha_cmd_alloc, alpha_cmd_ref); // Written first at the highest offset; the forward list later jumps into the alpha list, which ends here.
+    }
+    uint alpha_cmd_limit = alpha_cmd_alloc.offset + Cmd_size; // Same form alloc_cmd_rev uses, so the initial slice is filled before any malloc is attempted.
     // The nesting depth of the clip stack
     uint clip_depth = 1;
     // State for the "clip zero" optimization. If it's nonzero, then we are
@@ -312,19 +334,43 @@ void main() {
                     Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
-                    CmdFill cmd_fill;
-                    cmd_fill.tile_ref = tile.tile.offset;
-                    cmd_fill.backdrop = tile.backdrop;
-                    cmd_fill.rgba_color = fill.rgba_color;
-                    Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
-                    if (tile.tile.offset == 0) {
-                        // Anything below is occluded due to drawing front-to-back.
-                        clip_zero_depth = clip_depth;
+                    if (unpackUnorm4x8(fill.rgba_color).wzyx.a == 1.0) {
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                            break;
+                        }
+                        CmdFill cmd_fill;
+                        cmd_fill.tile_ref = tile.tile.offset;
+                        cmd_fill.backdrop = tile.backdrop;
+                        cmd_fill.rgba_color = fill.rgba_color;
+                        Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
+                        if (tile.tile.offset == 0) {
+                            // Anything below is occluded due to drawing front-to-back.
+                            clip_zero_depth = clip_depth;
+                        }
+                        cmd_ref.offset += Cmd_size;
+                    } else {
+                        if (!alloc_cmd_rev(alpha_cmd_alloc, alpha_cmd_ref, alpha_cmd_limit)) {
+                            break;
+                        }
+                        alpha_cmd_ref.offset -= Cmd_size;
+                        CmdFill cmd_fill;
+                        cmd_fill.tile_ref = tile.tile.offset;
+                        cmd_fill.backdrop = tile.backdrop;
+                        cmd_fill.rgba_color = fill.rgba_color;
+                        Cmd_Fill_write(alpha_cmd_alloc, alpha_cmd_ref, cmd_fill);
+
+                        if (!alloc_cmd_rev(alpha_cmd_alloc, alpha_cmd_ref, alpha_cmd_limit)) {
+                            break;
+                        }
+                        alpha_cmd_ref.offset -= Cmd_size;
+                        Cmd_RestoreStencil_write(alpha_cmd_alloc, alpha_cmd_ref);
+
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                            break;
+                        }
+                        Cmd_SaveStencil_write(cmd_alloc, cmd_ref);
+                        cmd_ref.offset += Cmd_size;
                     }
-                    cmd_ref.offset += Cmd_size;
                     break;
                 case Annotated_BeginClip:
                     tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
@@ -393,6 +439,7 @@ void main() {
         if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
-        Cmd_End_write(cmd_alloc, cmd_ref);
+        CmdJump jump = CmdJump(alpha_cmd_ref.offset);
+        Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
     }
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
@@ -50,6 +50,10 @@ uint getLut(vec2 n, float c) {
         c = -c;
         mask = ~0;
     }
+    // It is also possible to implement this in a branchless manner by clamping c to be under 1. It allows more ILP on
+    // at least AMD GPUs (a wait is forcibly inserted when converging the branches so we need to avoid that), but when
+    // testing on RX 5700 XT, it turned out to be slower even though it should hide the latency more effectively. Maybe
+    // it's around the point where both peak bandwidth and latency could become the bottleneck.
     if (c >= 1.) {
         // mask ^= 0;
     } else {
@@ -114,16 +118,19 @@ void main() {
     }
 
     uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
-    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC * 2, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
 
     uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
     vec3 rgb[CHUNK];
     uint stencil[CHUNK];
     uint coverage[CHUNK];
-    uint stencil_stack[32][CHUNK];
+    uint stencil_stack[256][CHUNK];
     uint clip_depth = 0;
+    uint alpha_stencil_stack[256][CHUNK];
+    uint alpha_depth = 0;
+
     Alloc clip_tos = new_alloc(0, 0);
     for (uint i = 0; i < CHUNK; i++) {
         stencil[i] = 0;
@@ -165,10 +172,13 @@ void main() {
             fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
             for (uint k = 0; k < CHUNK; k++) {
                 float area = float(bitCount(coverage[k] & ~stencil[k])) / 32.;
-                // TODO: alpha
-                rgb[k] = rgb[k] + fg_rgba.rgb * area;
-                // only cover if opaque...
-                stencil[k] |= coverage[k];
+                if (fg_rgba.a == 1.0) {
+                    rgb[k] = rgb[k] + fg_rgba.rgb * area;
+                    // only update stencil if opaque
+                    stencil[k] |= coverage[k];
+                } else {
+                    rgb[k] = mix(rgb[k], fg_rgba.rgb, fg_rgba.a * area);
+                }
             }
             break;
         case Cmd_BeginClip:
@@ -188,6 +198,18 @@ void main() {
                 stencil[k] = stencil_stack[clip_depth][k];
             }
             break;
+        case Cmd_SaveStencil:
+            for (uint k = 0; k < CHUNK; k++) {
+                alpha_stencil_stack[alpha_depth][k] = stencil[k];
+            }
+            alpha_depth++;
+            break;
+        case Cmd_RestoreStencil:
+            alpha_depth--;
+            for (uint k = 0; k < CHUNK; k++) {
+                stencil[k] = alpha_stencil_stack[alpha_depth][k];
+            }
+            break;
         case Cmd_Jump:
             cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
             cmd_alloc.offset = cmd_ref.offset;

diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
@@ -73,6 +73,8 @@ CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
 #define Cmd_EndClip 3
 #define Cmd_Stroke 4
 #define Cmd_Jump 5
+#define Cmd_SaveStencil 6
+#define Cmd_RestoreStencil 7
 #define Cmd_size 16
 
 CmdRef Cmd_index(CmdRef ref, uint index) {
@@ -194,3 +196,11 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
     CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
 }
 
+void Cmd_SaveStencil_write(Alloc a, CmdRef ref) { // Emits a SaveStencil command at ref; only the tag is written.
+    write_mem(a, ref.offset >> 2, Cmd_SaveStencil); // ref.offset >> 2 is the index passed to write_mem for the tag.
+}
+
+void Cmd_RestoreStencil_write(Alloc a, CmdRef ref) { // Emits a RestoreStencil command at ref; only the tag is written.
+    write_mem(a, ref.offset >> 2, Cmd_RestoreStencil); // ref.offset >> 2 is the index passed to write_mem for the tag.
+}
+
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
@@ -249,7 +249,7 @@ impl Renderer {
         let bin_base = alloc;
         alloc += ((n_paths + 255) & !255) * BIN_SIZE;
         let ptcl_base = alloc;
-        alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+        alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC * 2;
         let pathseg_base = alloc;
         alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3;
         let anno_base = alloc;