Add multisampled antialiasing
This is ported from the multi branch.

Configuration of antialiasing mode is currently set statically, but could become more dynamic. In addition, the mask LUT is computed and uploaded every frame, rather than being persistent.
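A possible follow-up on the persistence point: the CPU side of the mask LUT could be memoized so it is generated once and reused across frames. This is a minimal sketch only; make_mask_lut() is a hypothetical stand-in for the generator in the new mask module (sized here from the msaa16 shader constants), and making the GPU side persistent would additionally need a long-lived buffer in the engine.

use std::sync::OnceLock;

// Hypothetical generator; the real table is produced by the new mask module.
fn make_mask_lut() -> Vec<u8> {
    // ... build the 64x64 table, 16 samples (2 bytes) per entry ...
    vec![0u8; 64 * 64 * 2]
}

// Compute the LUT on first use and hand back the same slice on every later frame.
fn mask_lut() -> &'static [u8] {
    static LUT: OnceLock<Vec<u8>> = OnceLock::new();
    LUT.get_or_init(make_mask_lut)
}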
raphlinus committed Oct 11, 2023 · 1 parent 9bdbb10 · commit 2f36411
Showing 6 changed files with 522 additions and 39 deletions.
.vscode/settings.json (2 changes: 1 addition & 1 deletion)
@@ -16,6 +16,6 @@
},
"wgsl-analyzer.diagnostics.nagaVersion": "main",
"wgsl-analyzer.preprocessor.shaderDefs": [
"full"
"full", "msaa16", "msaa"
]
}
shader/fine.wgsl (324 changes: 314 additions & 10 deletions)
@@ -2,8 +2,10 @@

// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
//
// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
// or msaa16.

// This struct is a cut-and-paste shared with backdrop.
struct Tile {
backdrop: i32,
segments: u32,
@@ -18,8 +20,6 @@ var<uniform> config: Config;
@group(0) @binding(1)
var<storage> segments: array<Segment>;

#ifdef full

#import blend
#import ptcl

@@ -40,6 +40,309 @@ var gradients: texture_2d<f32>;
@group(0) @binding(6)
var image_atlas: texture_2d<f32>;

#ifdef msaa8
let MASK_WIDTH = 32u;
let MASK_HEIGHT = 32u;
let SH_SAMPLES_SIZE = 256u;
let SAMPLE_WORDS_PER_PIXEL = 1u;
// This might be better in uniform, but that has 16 byte alignment
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 256u>;
#endif

#ifdef msaa16
let MASK_WIDTH = 64u;
let MASK_HEIGHT = 64u;
let SH_SAMPLES_SIZE = 512u;
let SAMPLE_WORDS_PER_PIXEL = 2u;
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 2048u>;
#endif

#ifdef msaa
let WG_SIZE = 64u;
var<workgroup> sh_count: array<u32, WG_SIZE>;

// This is 8 winding numbers packed to a u32, 4 bits per sample
var<workgroup> sh_winding: array<atomic<u32>, 32u>;
// Same packing, one group of 8 per pixel
var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
// Same packing, accumulating winding numbers for vertical edge crossings
var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;

// number of integer cells spanned by interval defined by a, b
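// e.g. span(0.3, 2.1) == 3u, and span(1.0, 1.0) == 1u (a degenerate interval still covers one cell)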
fn span(a: f32, b: f32) -> u32 {
return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
}

let SEG_SIZE = 5u;

// New multisampled algorithm.
fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
let tile_origin = vec2(f32(wg_id.x) * f32(TILE_HEIGHT), f32(wg_id.y) * f32(TILE_WIDTH));
let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
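// Shared winding and sample words are initialized to 0x88888888u: each 4-bit lane carries a
// bias of 8, so signed winding deltas can be accumulated in unsigned atomics without underflow.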
if th_ix < 32u {
if th_ix < 2u {
atomicStore(&sh_winding_y[th_ix], 0x88888888u);
}
atomicStore(&sh_winding[th_ix], 0x88888888u);
}
let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
for (var i = 0u; i < sample_count; i++) {
atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
}
workgroupBarrier();
let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
for (var batch = 0u; batch < n_batch; batch++) {
let seg_ix = batch * WG_SIZE + th_ix;
let seg_off = fill.seg_data + seg_ix;
var count = 0u;
let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
// TODO: might save a register rewriting this in terms of limit
if th_ix < slice_size {
let segment = segments[seg_off];
// Note: coords relative to tile origin probably a good idea in coarse path,
// especially as f16 would work. But keeping existing scheme for compatibility.
let xy0 = segment.origin - tile_origin;
let xy1 = xy0 + segment.delta;
var y_edge_f = f32(TILE_HEIGHT);
var delta = select(-1, 1, xy1.x <= xy0.x);
if xy0.x == 0.0 && xy1.x == 0.0 {
if xy0.y == 0.0 {
y_edge_f = 0.0;
} else if xy1.y == 0.0 {
y_edge_f = 0.0;
delta = -delta;
}
} else {
if xy0.x == 0.0 {
if xy0.y != 0.0 {
y_edge_f = xy0.y;
}
} else if xy1.x == 0.0 && xy1.y != 0.0 {
y_edge_f = xy1.y;
}
// discard horizontal lines aligned to pixel grid
if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
}
}
let y_edge = u32(ceil(y_edge_f));
if y_edge < TILE_HEIGHT {
atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
}
}
// workgroup prefix sum of counts
sh_count[th_ix] = count;
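// firstLeadingBit(2u * slice_size - 1u) == ceil(log2(slice_size)), the number of doubling
// steps needed by the Hillis-Steele scan below.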
let lg_n = firstLeadingBit(slice_size * 2u - 1u);
for (var i = 0u; i < lg_n; i++) {
workgroupBarrier();
if th_ix >= 1u << i {
count += sh_count[th_ix - (1u << i)];
}
workgroupBarrier();
sh_count[th_ix] = count;
}
#ifdef have_uniform
let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
#else
workgroupBarrier();
let total = sh_count[slice_size - 1u];
#endif
for (var i = th_ix; i < total; i += WG_SIZE) {
// binary search to find pixel
var lo = 0u;
var hi = slice_size;
let goal = i;
while hi > lo + 1u {
let mid = (lo + hi) >> 1u;
if goal >= sh_count[mid - 1u] {
lo = mid;
} else {
hi = mid;
}
}
let el_ix = lo;
let last_pixel = i + 1u == sh_count[el_ix];
let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
let segment = segments[seg_off];
let xy0_in = segment.origin - tile_origin;
let xy1_in = xy0_in + segment.delta;
let is_down = xy1_in.y >= xy0_in.y;
let xy0 = select(xy1_in, xy0_in, is_down);
let xy1 = select(xy0_in, xy1_in, is_down);

// Set up data for line rasterization
// Note: this is duplicated work if total count exceeds a workgroup.
// One alternative is to compute it in a separate dispatch.
let dx = abs(xy1.x - xy0.x);
let dy = xy1.y - xy0.y;
let dy_dxdy = dy / (dx + dy);
let a = dx / (dx + dy);
let is_positive_slope = xy1.x >= xy0.x;
let sign = select(-1.0, 1.0, is_positive_slope);
let xt0 = floor(xy0.x * sign);
let c = xy0.x * sign - xt0;
// This has a special case in the JS code, but we should just not render
let y0i = floor(xy0.y);
let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y);
let b = dy_dxdy * c + a * (ytop - xy0.y);
let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
// Use line equation to plot pixel coordinates
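// Each increment of sub_ix is one pixel crossing: z counts the x steps taken so far,
// and sub_ix - z counts the y steps.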

let zf = a * f32(sub_ix) + b;
let z = floor(zf);
let x = x0i + i32(sign * z);
let y = i32(y0i) + i32(sub_ix) - i32(z);
var is_delta: bool;
// We need to adjust winding number if slope is positive and there
// is a crossing at the left edge of the pixel.
var is_bump = false;
let zp = floor(a * f32(sub_ix - 1u) + b);
if sub_ix == 0u {
is_delta = y0i == xy0.y && y0i != xy1.y;
is_bump = xy0.x == 0.0;
} else {
is_delta = z == zp;
is_bump = is_positive_slope && !is_delta;
}
let pix_ix = u32(y) * TILE_WIDTH + u32(x);
if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
let delta_pix = pix_ix + 1u;
if is_delta {
let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
atomicAdd(&sh_winding[delta_pix >> 3u], delta);
}
}
// Apply sample mask
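// Select the LUT half by slope sign, the row by the slope parameter a, and the column by
// the fractional part of the line-equation value zf.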
let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
let half_height = f32(MASK_HEIGHT / 2u);
let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
let mask_col = floor((zf - z) * f32(MASK_WIDTH));
let mask_ix = mask_block + u32(mask_row + mask_col);
#ifdef msaa8
var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
mask &= 0xffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
mask &= 0xffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
mask &= ~(0xffu << mask_shift);
}
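// Spread the 8 one-bit coverage samples into the low bit of eight 4-bit lanes (sample i
// lands at bit 4*i); the signed per-sample winding deltas are then accumulated atomically.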
let mask_a = mask | (mask << 6u);
let mask_b = mask_a | (mask_a << 12u);
let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
if is_bump {
mask_signed += select(u32(-0x11111111), 0x1111111u, is_down);
}
atomicAdd(&sh_samples[pix_ix], mask_signed);
#endif
#ifdef msaa16
var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
mask &= 0xffffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
mask &= 0xffffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
mask &= ~(0xffffu << mask_shift);
}
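// Same expansion as msaa8, done per byte: the low 8 samples accumulate into
// sh_samples[2u * pix_ix] and the high 8 into sh_samples[2u * pix_ix + 1u].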
let mask0 = mask & 0xffu;
let mask0_a = mask0 | (mask0 << 6u);
let mask0_b = mask0_a | (mask0_a << 12u);
let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
let mask1 = (mask >> 8u) & 0xffu;
let mask1_a = mask1 | (mask1 << 6u);
let mask1_b = mask1_a | (mask1_a << 12u);
let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
if is_bump {
let bump_delta = select(u32(-0x11111111), 0x1111111u, is_down);
mask0_signed += bump_delta;
mask1_signed += bump_delta;
}
atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
#endif
}
workgroupBarrier();
}
var area: array<f32, PIXELS_PER_THREAD>;
let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
var packed_w = atomicLoad(&sh_winding[major]);
// Prefix sum of packed 4 bit values within u32
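// Subtracting 0x8888888u first strips the bias from the lanes being shifted in, so each
// lane keeps exactly one bias of 8 after every doubling step.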
packed_w += (packed_w - 0x8888888u) << 4u;
packed_w += (packed_w - 0x888888u) << 8u;
packed_w += (packed_w - 0x8888u) << 16u;
// Note: could probably do bias in one go, but it would be inscrutable
if (major & 1u) != 0u {
// We could use shmem to communicate the value from another thread;
// if we had subgroups that would almost certainly be the most
// efficient way. But we just calculate again for simplicity.
var last_packed = atomicLoad(&sh_winding[major - 1u]);
last_packed += (last_packed - 0x8888888u) << 4u;
last_packed += (last_packed - 0x888888u) << 8u;
last_packed += (last_packed - 0x8888u) << 16u;
let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
packed_w += bump;
}
var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
packed_y += (packed_y - 0x8888888u) << 4u;
packed_y += (packed_y - 0x888888u) << 8u;
packed_y += (packed_y - 0x8888u) << 16u;
if th_ix == 0u {
atomicStore(&sh_winding_y[0], packed_y);
}
workgroupBarrier();
var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
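// Rows 8..15 are held in the second word; fold in the running total carried out of the
// first word's top lane.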
if local_id.y >= 8u {
wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
}

for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
let pix_ix = th_ix * PIXELS_PER_THREAD + i;
let minor = pix_ix & 7u;
//let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
// TODO: math might be off here
let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
if expected_zero >= 16u {
area[i] = 1.0;
} else {
#ifdef msaa8
let samples = atomicLoad(&sh_samples[pix_ix]);
let xored = (expected_zero * 0x11111111u) ^ samples;
// Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
let xored2 = xored | (xored * 2u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
#endif
#ifdef msaa16
let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
let xored0 = (expected_zero * 0x11111111u) ^ samples0;
let xored0_2 = xored0 | (xored0 * 2u);
let xored1 = (expected_zero * 0x11111111u) ^ samples1;
let xored1_2 = xored1 | (xored1 >> 1u);
let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
#endif
}
}
return area;
}
#endif

fn read_fill(cmd_ix: u32) -> CmdFill {
let size_and_rule = ptcl[cmd_ix + 1u];
let seg_data = ptcl[cmd_ix + 2u];
@@ -126,15 +429,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 {
}
}

#else

@group(0) @binding(3)
var output: texture_storage_2d<r8, write>;

#endif

let PIXELS_PER_THREAD = 4u;

// Analytic area antialiasing.
//
// This is currently dead code if msaa is enabled, but it would be fairly straightforward
// to wire this so it's a dynamic choice (even per-path).
fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
@@ -220,7 +520,11 @@ fn main(
// CMD_FILL
case 1u: {
let fill = read_fill(cmd_ix);
#ifdef msaa
area = fill_path_ms(fill, wg_id.xy, local_id.xy);
#else
area = fill_path(fill, xy);
#endif
cmd_ix += 4u;
}
// CMD_STROKE
src/lib.rs (14 changes: 14 additions & 0 deletions)
@@ -17,6 +17,7 @@
mod cpu_dispatch;
mod cpu_shader;
mod engine;
mod mask;
mod render;
mod scene;
mod shaders;
@@ -61,6 +62,19 @@ pub type Error = Box<dyn std::error::Error>;
/// Specialization of `Result` for our catch-all error type.
pub type Result<T> = std::result::Result<T, Error>;

/// Possible configurations for antialiasing.
#[derive(PartialEq, Eq)]
#[allow(unused)]
enum AaConfig {
Area,
Msaa8,
Msaa16,
}

/// Configuration of antialiasing. Currently this is static, but could be switched to
/// a launch option or even finer-grained.
const ANTIALIASING: AaConfig = AaConfig::Msaa16;

/// Renders a scene into a texture or surface.
#[cfg(feature = "wgpu")]
pub struct Renderer {
(Diffs for the remaining three changed files were not loaded.)
