Add multisampled antialiasing
This is ported from the multi branch.

Configuration of antialiasing mode is currently set statically, but could become more dynamic. In addition, the mask LUT is computed and uploaded every frame, rather than being persistent.
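A possible follow-up on the persistence point: the CPU side of the mask LUT could be memoized so it is generated once and reused across frames. This is a minimal sketch only; make_mask_lut() is a hypothetical stand-in for the generator in the new mask module (sized here from the msaa16 shader constants), and making the GPU side persistent would additionally need a long-lived buffer in the engine.

use std::sync::OnceLock;

// Hypothetical generator; the real table is produced by the new mask module.
fn make_mask_lut() -> Vec<u8> {
    // ... build the 64x64 table, 16 samples (2 bytes) per entry ...
    vec![0u8; 64 * 64 * 2]
}

// Compute the LUT on first use and hand back the same slice on every later frame.
fn mask_lut() -> &'static [u8] {
    static LUT: OnceLock<Vec<u8>> = OnceLock::new();
    LUT.get_or_init(make_mask_lut)
}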
raphlinus committed Oct 11, 2023 · 1 parent 9bdbb10 · commit 2f36411
Showing 6 changed files with 522 additions and 39 deletions.
.vscode/settings.json (2 changes: 1 addition & 1 deletion)
@@ -16,6 +16,6 @@
},
"wgsl-analyzer.diagnostics.nagaVersion": "main",
"wgsl-analyzer.preprocessor.shaderDefs": [
"full"
"full", "msaa16", "msaa"
]
}
shader/fine.wgsl (324 changes: 314 additions & 10 deletions)
@@ -2,8 +2,10 @@

// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
//
// To enable multisampled rendering, turn on both the msaa ifdef and one of msaa8
// or msaa16.

// This struct is a cut-and-paste shared with backdrop.
struct Tile {
backdrop: i32,
segments: u32,
@@ -18,8 +20,6 @@ var<uniform> config: Config;
@group(0) @binding(1)
var<storage> segments: array<Segment>;

#ifdef full

#import blend
#import ptcl

@@ -40,6 +40,309 @@ var gradients: texture_2d<f32>;
@group(0) @binding(6)
var image_atlas: texture_2d<f32>;

#ifdef msaa8
let MASK_WIDTH = 32u;
let MASK_HEIGHT = 32u;
let SH_SAMPLES_SIZE = 256u;
let SAMPLE_WORDS_PER_PIXEL = 1u;
// This might be better in uniform, but that has 16 byte alignment
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 256u>;
#endif

#ifdef msaa16
let MASK_WIDTH = 64u;
let MASK_HEIGHT = 64u;
let SH_SAMPLES_SIZE = 512u;
let SAMPLE_WORDS_PER_PIXEL = 2u;
@group(0) @binding(7)
var<storage> mask_lut: array<u32, 2048u>;
#endif

#ifdef msaa
let WG_SIZE = 64u;
var<workgroup> sh_count: array<u32, WG_SIZE>;

// This is 8 winding numbers packed to a u32, 4 bits per sample
var<workgroup> sh_winding: array<atomic<u32>, 32u>;
// Same packing, one group of 8 per pixel
var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
// Same packing, accumulating winding numbers for vertical edge crossings
var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;

// number of integer cells spanned by interval defined by a, b
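// e.g. span(0.3, 2.1) == 3u, and span(1.0, 1.0) == 1u (a degenerate interval still covers one cell)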
fn span(a: f32, b: f32) -> u32 {
return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
}

let SEG_SIZE = 5u;

// New multisampled algorithm.
fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
let tile_origin = vec2(f32(wg_id.x) * f32(TILE_HEIGHT), f32(wg_id.y) * f32(TILE_WIDTH));
let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
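// Shared winding and sample words are initialized to 0x88888888u: each 4-bit lane carries a
// bias of 8, so signed winding deltas can be accumulated in unsigned atomics without underflow.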
if th_ix < 32u {
if th_ix < 2u {
atomicStore(&sh_winding_y[th_ix], 0x88888888u);
}
atomicStore(&sh_winding[th_ix], 0x88888888u);
}
let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
for (var i = 0u; i < sample_count; i++) {
atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
}
workgroupBarrier();
let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
for (var batch = 0u; batch < n_batch; batch++) {
let seg_ix = batch * WG_SIZE + th_ix;
let seg_off = fill.seg_data + seg_ix;
var count = 0u;
let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
// TODO: might save a register rewriting this in terms of limit
if th_ix < slice_size {
let segment = segments[seg_off];
// Note: coords relative to tile origin probably a good idea in coarse path,
// especially as f16 would work. But keeping existing scheme for compatibility.
let xy0 = segment.origin - tile_origin;
let xy1 = xy0 + segment.delta;
var y_edge_f = f32(TILE_HEIGHT);
var delta = select(-1, 1, xy1.x <= xy0.x);
if xy0.x == 0.0 && xy1.x == 0.0 {
if xy0.y == 0.0 {
y_edge_f = 0.0;
} else if xy1.y == 0.0 {
y_edge_f = 0.0;
delta = -delta;
}
} else {
if xy0.x == 0.0 {
if xy0.y != 0.0 {
y_edge_f = xy0.y;
}
} else if xy1.x == 0.0 && xy1.y != 0.0 {
y_edge_f = xy1.y;
}
// discard horizontal lines aligned to pixel grid
if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
}
}
let y_edge = u32(ceil(y_edge_f));
if y_edge < TILE_HEIGHT {
atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
}
}
// workgroup prefix sum of counts
sh_count[th_ix] = count;
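// firstLeadingBit(2u * slice_size - 1u) == ceil(log2(slice_size)), the number of doubling
// steps needed by the Hillis-Steele scan below.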
let lg_n = firstLeadingBit(slice_size * 2u - 1u);
for (var i = 0u; i < lg_n; i++) {
workgroupBarrier();
if th_ix >= 1u << i {
count += sh_count[th_ix - (1u << i)];
}
workgroupBarrier();
sh_count[th_ix] = count;
}
#ifdef have_uniform
let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
#else
workgroupBarrier();
let total = sh_count[slice_size - 1u];
#endif
for (var i = th_ix; i < total; i += WG_SIZE) {
// binary search to find pixel
var lo = 0u;
var hi = slice_size;
let goal = i;
while hi > lo + 1u {
let mid = (lo + hi) >> 1u;
if goal >= sh_count[mid - 1u] {
lo = mid;
} else {
hi = mid;
}
}
let el_ix = lo;
let last_pixel = i + 1u == sh_count[el_ix];
let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
let seg_off = fill.seg_data + batch * WG_SIZE + el_ix;
let segment = segments[seg_off];
let xy0_in = segment.origin - tile_origin;
let xy1_in = xy0_in + segment.delta;
let is_down = xy1_in.y >= xy0_in.y;
let xy0 = select(xy1_in, xy0_in, is_down);
let xy1 = select(xy0_in, xy1_in, is_down);

// Set up data for line rasterization
// Note: this is duplicated work if total count exceeds a workgroup.
// One alternative is to compute it in a separate dispatch.
let dx = abs(xy1.x - xy0.x);
let dy = xy1.y - xy0.y;
let dy_dxdy = dy / (dx + dy);
let a = dx / (dx + dy);
let is_positive_slope = xy1.x >= xy0.x;
let sign = select(-1.0, 1.0, is_positive_slope);
let xt0 = floor(xy0.x * sign);
let c = xy0.x * sign - xt0;
// This has a special case in the JS code, but we should just not render
let y0i = floor(xy0.y);
let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y);
let b = dy_dxdy * c + a * (ytop - xy0.y);
let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
// Use line equation to plot pixel coordinates
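// Each increment of sub_ix is one pixel crossing: z counts the x steps taken so far,
// and sub_ix - z counts the y steps.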

let zf = a * f32(sub_ix) + b;
let z = floor(zf);
let x = x0i + i32(sign * z);
let y = i32(y0i) + i32(sub_ix) - i32(z);
var is_delta: bool;
// We need to adjust winding number if slope is positive and there
// is a crossing at the left edge of the pixel.
var is_bump = false;
let zp = floor(a * f32(sub_ix - 1u) + b);
if sub_ix == 0u {
is_delta = y0i == xy0.y && y0i != xy1.y;
is_bump = xy0.x == 0.0;
} else {
is_delta = z == zp;
is_bump = is_positive_slope && !is_delta;
}
let pix_ix = u32(y) * TILE_WIDTH + u32(x);
if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
let delta_pix = pix_ix + 1u;
if is_delta {
let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
atomicAdd(&sh_winding[delta_pix >> 3u], delta);
}
}
// Apply sample mask
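// Select the LUT half by slope sign, the row by the slope parameter a, and the column by
// the fractional part of the line-equation value zf.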
let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
let half_height = f32(MASK_HEIGHT / 2u);
let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
let mask_col = floor((zf - z) * f32(MASK_WIDTH));
let mask_ix = mask_block + u32(mask_row + mask_col);
#ifdef msaa8
var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
mask &= 0xffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
mask &= 0xffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
mask &= ~(0xffu << mask_shift);
}
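// Spread the 8 one-bit coverage samples into the low bit of eight 4-bit lanes (sample i
// lands at bit 4*i); the signed per-sample winding deltas are then accumulated atomically.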
let mask_a = mask | (mask << 6u);
let mask_b = mask_a | (mask_a << 12u);
let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
if is_bump {
mask_signed += select(u32(-0x11111111), 0x1111111u, is_down);
}
atomicAdd(&sh_samples[pix_ix], mask_signed);
#endif
#ifdef msaa16
var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
mask &= 0xffffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
mask &= 0xffffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
mask &= ~(0xffffu << mask_shift);
}
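// Same expansion as msaa8, done per byte: the low 8 samples accumulate into
// sh_samples[2u * pix_ix] and the high 8 into sh_samples[2u * pix_ix + 1u].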
let mask0 = mask & 0xffu;
let mask0_a = mask0 | (mask0 << 6u);
let mask0_b = mask0_a | (mask0_a << 12u);
let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
let mask1 = (mask >> 8u) & 0xffu;
let mask1_a = mask1 | (mask1 << 6u);
let mask1_b = mask1_a | (mask1_a << 12u);
let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
if is_bump {
let bump_delta = select(u32(-0x11111111), 0x1111111u, is_down);
mask0_signed += bump_delta;
mask1_signed += bump_delta;
}
atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
#endif
}
workgroupBarrier();
}
var area: array<f32, PIXELS_PER_THREAD>;
let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
var packed_w = atomicLoad(&sh_winding[major]);
// Prefix sum of packed 4 bit values within u32
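// Subtracting 0x8888888u first strips the bias from the lanes being shifted in, so each
// lane keeps exactly one bias of 8 after every doubling step.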
packed_w += (packed_w - 0x8888888u) << 4u;
packed_w += (packed_w - 0x888888u) << 8u;
packed_w += (packed_w - 0x8888u) << 16u;
// Note: could probably do bias in one go, but it would be inscrutable
if (major & 1u) != 0u {
// We could use shmem to communicate the value from another thread;
// if we had subgroups that would almost certainly be the most
// efficient way. But we just calculate again for simplicity.
var last_packed = atomicLoad(&sh_winding[major - 1u]);
last_packed += (last_packed - 0x8888888u) << 4u;
last_packed += (last_packed - 0x888888u) << 8u;
last_packed += (last_packed - 0x8888u) << 16u;
let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
packed_w += bump;
}
var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
packed_y += (packed_y - 0x8888888u) << 4u;
packed_y += (packed_y - 0x888888u) << 8u;
packed_y += (packed_y - 0x8888u) << 16u;
if th_ix == 0u {
atomicStore(&sh_winding_y[0], packed_y);
}
workgroupBarrier();
var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
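// Rows 8..15 are held in the second word; fold in the running total carried out of the
// first word's top lane.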
if local_id.y >= 8u {
wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
}

for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
let pix_ix = th_ix * PIXELS_PER_THREAD + i;
let minor = pix_ix & 7u;
//let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
// TODO: math might be off here
let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(fill.backdrop);
if expected_zero >= 16u {
area[i] = 1.0;
} else {
#ifdef msaa8
let samples = atomicLoad(&sh_samples[pix_ix]);
let xored = (expected_zero * 0x11111111u) ^ samples;
// Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
let xored2 = xored | (xored * 2u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
#endif
#ifdef msaa16
let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
let xored0 = (expected_zero * 0x11111111u) ^ samples0;
let xored0_2 = xored0 | (xored0 * 2u);
let xored1 = (expected_zero * 0x11111111u) ^ samples1;
let xored1_2 = xored1 | (xored1 >> 1u);
let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
#endif
}
}
return area;
}
#endif

fn read_fill(cmd_ix: u32) -> CmdFill {
let size_and_rule = ptcl[cmd_ix + 1u];
let seg_data = ptcl[cmd_ix + 2u];
@@ -126,15 +429,12 @@ fn extend_mode(t: f32, mode: u32) -> f32 {
}
}

#else

@group(0) @binding(3)
var output: texture_storage_2d<r8, write>;

#endif

let PIXELS_PER_THREAD = 4u;

// Analytic area antialiasing.
//
// This is currently dead code if msaa is enabled, but it would be fairly straightforward
// to wire this so it's a dynamic choice (even per-path).
fn fill_path(fill: CmdFill, xy: vec2<f32>) -> array<f32, PIXELS_PER_THREAD> {
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
@@ -220,7 +520,11 @@ fn main(
// CMD_FILL
case 1u: {
let fill = read_fill(cmd_ix);
#ifdef msaa
area = fill_path_ms(fill, wg_id.xy, local_id.xy);
#else
area = fill_path(fill, xy);
#endif
cmd_ix += 4u;
}
// CMD_STROKE
src/lib.rs (14 changes: 14 additions & 0 deletions)
@@ -17,6 +17,7 @@
mod cpu_dispatch;
mod cpu_shader;
mod engine;
mod mask;
mod render;
mod scene;
mod shaders;
@@ -61,6 +62,19 @@ pub type Error = Box<dyn std::error::Error>;
/// Specialization of `Result` for our catch-all error type.
pub type Result<T> = std::result::Result<T, Error>;

/// Possible configurations for antialiasing.
#[derive(PartialEq, Eq)]
#[allow(unused)]
enum AaConfig {
Area,
Msaa8,
Msaa16,
}

/// Configuration of antialiasing. Currently this is static, but could be switched to
/// a launch option or even finer-grained.
const ANTIALIASING: AaConfig = AaConfig::Msaa16;

/// Renders a scene into a texture or surface.
#[cfg(feature = "wgpu")]
pub struct Renderer {
(Diffs for the remaining three changed files were not loaded.)
