diff --git a/.vscode/settings.json b/.vscode/settings.json
index 883465a1a..6deaef71a 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,7 @@
 {
     "wgsl-analyzer.customImports": {
         "bbox": "${workspaceFolder}/shader/shared/bbox.wgsl",
+        "bbox_monoid": "${workspaceFolder}/shader/shared/bbox_monoid.wgsl",
         "blend": "${workspaceFolder}/shader/shared/blend.wgsl",
         "bump": "${workspaceFolder}/shader/shared/bump.wgsl",
         "clip": "${workspaceFolder}/shader/shared/clip.wgsl",
diff --git a/shader/bbox_clear.wgsl b/shader/bbox_clear.wgsl
deleted file mode 100644
index fe8ccebbd..000000000
--- a/shader/bbox_clear.wgsl
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-#import config
-
-@group(0) @binding(0)
-var<uniform> config: Config;
-
-struct PathBbox {
-    x0: i32,
-    y0: i32,
-    x1: i32,
-    y1: i32,
-    linewidth: f32,
-    trans_ix: u32,
-}
-
-@group(0) @binding(1)
-var<storage, read_write> path_bboxes: array<PathBbox>;
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id: vec3<u32>,
-) {
-    let ix = global_id.x;
-    if ix < config.n_path {
-        path_bboxes[ix].x0 = 0x7fffffff;
-        path_bboxes[ix].y0 = 0x7fffffff;
-        path_bboxes[ix].x1 = -0x80000000;
-        path_bboxes[ix].y1 = -0x80000000;
-    }
-}
diff --git a/shader/bbox_fixup.wgsl b/shader/bbox_fixup.wgsl
new file mode 100644
index 000000000..d336466fd
--- /dev/null
+++ b/shader/bbox_fixup.wgsl
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+#import config
+#import pathtag
+#import bbox
+#import bbox_monoid
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+@group(0) @binding(1)
+var<storage> tag_monoids: array<TagMonoid>;
+
+@group(0) @binding(2)
+var<storage> bbox_reduced: array<BboxMonoid>;
+
+@group(0) @binding(3)
+var<storage, read_write> path_bboxes: array<PathBbox>;
+
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
+fn round_down(x: f32) -> i32 {
+    return i32(floor(x));
+}
+
+fn round_up(x: f32) -> i32 {
+    return i32(ceil(x));
+}
+
+// In the configuration with <= 64k pathtags, there's only one
+// workgroup here, so the distinction between global and local is
+// not meaningful. But we'll probably want to #ifdef a larger
+// configuration, in which we also bind a doubly reduced buffer.
+@compute @workgroup_size(256)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+) {
+    var agg: BboxMonoid;
+    if global_id.x * WG_SIZE < config.n_pathtag {
+        agg = bbox_reduced[global_id.x];
+    }
+    sh_bbox[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+        workgroupBarrier();
+        if local_id.x >= 1u << i {
+            let other = sh_bbox[local_id.x - (1u << i)];
+            agg = combine_bbox_monoid(other, agg);
+        }
+        workgroupBarrier();
+        sh_bbox[local_id.x] = agg;
+    }
+    // Explanation of this trick: we don't need to fix up the first bbox.
+    // By offsetting the index, we can use the inclusive scan.
+    let ix = global_id.x + 1u;
+    if ix * WG_SIZE < config.n_pathtag {
+        // First path of the workgroup.
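+        // pathseg wrote this path's bbox using only its per-workgroup scan,
+        // so any contribution from segments processed in earlier workgroups
+        // (accumulated in agg) is merged in below.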
+        let path_ix = tag_monoids[ix * (WG_SIZE / 4u)].path_ix;
+        if (agg.flags & FLAG_RESET_BBOX) == 0u && (agg.bbox.z > agg.bbox.x || agg.bbox.w > agg.bbox.y) {
+            let out = &path_bboxes[path_ix];
+            // TODO: casting goes away
+            var bbox = vec4(f32((*out).x0), f32((*out).y0), f32((*out).x1), f32((*out).y1));
+            if bbox.z > bbox.x || bbox.w > bbox.y {
+                bbox = vec4(min(agg.bbox.xy, bbox.xy), max(agg.bbox.zw, bbox.zw));
+            } else {
+                bbox = agg.bbox;
+            }
+            (*out).x0 = round_down(bbox.x);
+            (*out).y0 = round_down(bbox.y);
+            (*out).x1 = round_up(bbox.z);
+            (*out).y1 = round_up(bbox.w);
+        }
+    }
+}
diff --git a/shader/pathseg.wgsl b/shader/pathseg.wgsl
index ec059abb7..a8f32f22d 100644
--- a/shader/pathseg.wgsl
+++ b/shader/pathseg.wgsl
@@ -11,6 +11,8 @@
 // There's some duplication of the decoding code but we won't worry about
 // that just now. Perhaps it could be factored more nicely later.
 
+#import bbox
+#import bbox_monoid
 #import config
 #import pathtag
 #import cubic
@@ -24,51 +26,14 @@ var<storage> scene: array<u32>;
 @group(0) @binding(2)
 var<storage> tag_monoids: array<TagMonoid>;
 
-struct AtomicPathBbox {
-    x0: atomic<i32>,
-    y0: atomic<i32>,
-    x1: atomic<i32>,
-    y1: atomic<i32>,
-    linewidth: f32,
-    trans_ix: u32,
-}
-
 @group(0) @binding(3)
-var<storage, read_write> path_bboxes: array<AtomicPathBbox>;
-
+var<storage, read_write> path_bboxes: array<PathBbox>;
 
 @group(0) @binding(4)
 var<storage, read_write> cubics: array<Cubic>;
 
-// Monoid is yagni, for future optimization
-
-// struct BboxMonoid {
-//     bbox: vec4<f32>,
-//     flags: u32,
-// }
-
-// let FLAG_RESET_BBOX = 1u;
-// let FLAG_SET_BBOX = 2u;
-
-// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
-//     var c: BboxMonoid;
-//     c.bbox = b.bbox;
-//     // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
-//     if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
-//         c.bbox = a.bbox;
-//     } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
-//         (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
-//     {
-//         c.bbox = vec4(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
-//     }
-//     c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
-//     c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
-//     return c;
-// }
-
-// fn bbox_monoid_identity() -> BboxMonoid {
-//     return BboxMonoid();
-// }
+@group(0) @binding(5)
+var<storage, read_write> bbox_reduced: array<BboxMonoid>;
 
 var<private> pathdata_base: u32;
 
@@ -115,10 +80,14 @@ fn round_up(x: f32) -> i32 {
     return i32(ceil(x));
 }
 
+let WG_SIZE = 256u;
+var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;
+
 @compute @workgroup_size(256)
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
 ) {
     let ix = global_id.x;
     let tag_word = scene[config.pathtag_base + (ix >> 2u)];
@@ -130,12 +99,10 @@ fn main(
 
     let out = &path_bboxes[tm.path_ix];
     let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
-    if (tag_byte & PATH_TAG_PATH) != 0u {
-        (*out).linewidth = linewidth;
-        (*out).trans_ix = tm.trans_ix;
-    }
+    let bbox_flags = u32((tag_byte & PATH_TAG_PATH) != 0u);
     // Decode path data
     let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
+    var bbox: vec4<f32>;
     if seg_type != 0u {
         var p0: vec2<f32>;
         var p1: vec2<f32>;
@@ -163,7 +130,7 @@
         let transform = read_transform(config.transform_base, tm.trans_ix);
         p0 = transform_apply(transform, p0);
         p1 = transform_apply(transform, p1);
-        var bbox = vec4(min(p0, p1), max(p0, p1));
+        bbox = vec4(min(p0, p1), max(p0, p1));
         // Degree-raise
         if seg_type == PATH_TAG_LINETO {
             p3 = p1;
@@ -191,13 +158,29 @@
         }
         let flags = u32(linewidth >= 0.0);
         cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
-        // Update bounding box using atomics only. Computing a monoid is a
-        // potential future optimization.
-        if bbox.z > bbox.x || bbox.w > bbox.y {
-            atomicMin(&(*out).x0, round_down(bbox.x));
-            atomicMin(&(*out).y0, round_down(bbox.y));
-            atomicMax(&(*out).x1, round_up(bbox.z));
-            atomicMax(&(*out).y1, round_up(bbox.w));
+    }
+    var agg = BboxMonoid(bbox, bbox_flags);
+    sh_bbox[local_id.x] = agg;
+    for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
+        workgroupBarrier();
+        if local_id.x >= 1u << i {
+            let other = sh_bbox[local_id.x - (1u << i)];
+            agg = combine_bbox_monoid(other, agg);
         }
+        workgroupBarrier();
+        sh_bbox[local_id.x] = agg;
+    }
+    if local_id.x == WG_SIZE - 1u {
+        bbox_reduced[wg_id.x] = agg;
+    }
+    if bbox_flags != 0u {
+        let out = &path_bboxes[tm.path_ix];
+        // TODO: now that we're not atomic, don't need fixed-point
+        (*out).x0 = round_down(agg.bbox.x);
+        (*out).y0 = round_down(agg.bbox.y);
+        (*out).x1 = round_up(agg.bbox.z);
+        (*out).y1 = round_up(agg.bbox.w);
+        (*out).linewidth = linewidth;
+        (*out).trans_ix = tm.trans_ix;
     }
 }
diff --git a/shader/shared/bbox_monoid.wgsl b/shader/shared/bbox_monoid.wgsl
new file mode 100644
index 000000000..1a301d296
--- /dev/null
+++ b/shader/shared/bbox_monoid.wgsl
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+struct BboxMonoid {
+    bbox: vec4<f32>,
+    flags: u32,
+}
+
+let FLAG_RESET_BBOX = 1u;
+let FLAG_SET_BBOX = 2u;
+
+// Technically this is a semigroup with a left identity rather than a
+// true monoid, but that is good enough for our purposes.
+fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
+    var bbox = b.bbox;
+    if (b.flags & FLAG_SET_BBOX) == 0u && (a.flags & FLAG_RESET_BBOX) == 0u {
+        if bbox.z <= bbox.x && bbox.w <= bbox.y {
+            bbox = a.bbox;
+        } else if a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y {
+            bbox = vec4(min(a.bbox.xy, bbox.xy), max(a.bbox.zw, bbox.zw));
+        }
+    }
+    let flags = ((a.flags | (a.flags << 1u)) & FLAG_SET_BBOX) | b.flags;
+    return BboxMonoid(bbox, flags);
+}
+
diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl
index 0cb56d89a..6ef9434c2 100644
--- a/shader/shared/config.wgsl
+++ b/shader/shared/config.wgsl
@@ -10,6 +10,7 @@ struct Config {
     n_drawobj: u32,
     n_path: u32,
     n_clip: u32,
+    n_pathtag: u32,
 
     // To reduce the number of bindings, info and bin data are combined
     // into one buffer.
diff --git a/src/encoding/packed.rs b/src/encoding/packed.rs
index 700447729..34f7aef79 100644
--- a/src/encoding/packed.rs
+++ b/src/encoding/packed.rs
@@ -32,6 +32,8 @@ pub struct Layout {
     pub n_paths: u32,
     /// Number of clips.
     pub n_clips: u32,
+    /// Number of path tags.
+    pub n_pathtag: u32,
     /// Start of binning data.
     pub bin_data_start: u32,
     /// Start of path tag stream.
@@ -139,13 +141,14 @@ impl PackedEncoding {
         // Pack encoded data.
         let layout = &mut self.layout;
         *layout = Layout::default();
+        let n_path_tags = encoding.path_tags.len();
         layout.n_paths = encoding.n_paths;
         layout.n_draw_objects = encoding.n_paths;
         layout.n_clips = encoding.n_clips;
+        layout.n_pathtag = n_path_tags as u32;
         let data = &mut self.data;
         data.clear();
         // Path tag stream
-        let n_path_tags = encoding.path_tags.len();
         let path_tag_padded = align_up(n_path_tags, 4 * shaders::PATHTAG_REDUCE_WG);
         let capacity = path_tag_padded
             + slice_size_in_bytes(&encoding.path_data)
diff --git a/src/render.rs b/src/render.rs
index 0ee195aa8..fb5c253bb 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -23,7 +23,10 @@ const PATH_SIZE: u64 = 32;
 const DRAW_BBOX_SIZE: u64 = 16;
 const BUMP_SIZE: u64 = 16;
 const BIN_HEADER_SIZE: u64 = 8;
+const BBOX_MONOID_SIZE: u64 = 32;
 
+// Note: this is defined here as it's still used by the reduced pipeline,
+// but for the full pipeline, use the version in `encoding`.
 #[repr(C)]
 #[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
 struct Config {
@@ -34,6 +37,7 @@
     n_drawobj: u32,
     n_path: u32,
     n_clip: u32,
+    n_pathtag: u32,
     bin_data_start: u32,
     pathtag_base: u32,
     pathdata_base: u32,
@@ -202,14 +206,10 @@
     );
     let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
     let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE);
-    recording.dispatch(
-        shaders.bbox_clear,
-        (drawobj_wgs, 1, 1),
-        [config_buf, path_bbox_buf],
-    );
     let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
     let path_coarse_wgs =
         (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
+    let bbox_reduced_buf = ResourceProxy::new_buf(path_coarse_wgs as u64 * BBOX_MONOID_SIZE);
     recording.dispatch(
         shaders.pathseg,
         (path_coarse_wgs, 1, 1),
@@ -219,8 +219,14 @@
             tagmonoid_buf,
             path_bbox_buf,
             cubic_buf,
+            bbox_reduced_buf,
         ],
     );
+    recording.dispatch(
+        shaders.bbox_fixup,
+        (1, 1, 1),
+        [config_buf, tagmonoid_buf, bbox_reduced_buf, path_bbox_buf],
+    );
     let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
     recording.dispatch(
         shaders.draw_reduce,
diff --git a/src/shaders.rs b/src/shaders.rs
index 1df6b1477..9371d1a07 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -48,8 +48,8 @@ pub struct Shaders {
 pub struct FullShaders {
     pub pathtag_reduce: ShaderId,
     pub pathtag_scan: ShaderId,
-    pub bbox_clear: ShaderId,
     pub pathseg: ShaderId,
+    pub bbox_fixup: ShaderId,
     pub draw_reduce: ShaderId,
     pub draw_leaf: ShaderId,
     pub clip_reduce: ShaderId,
@@ -144,11 +144,6 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders, Error> {
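
The following is a small host-side sketch, not part of the patch and not vello API, that mirrors combine_bbox_monoid from shader/shared/bbox_monoid.wgsl in Rust and checks the property the pathseg/bbox_fixup split relies on: combining a scanned prefix of per-workgroup aggregates (what pathseg writes to bbox_reduced) with an in-workgroup inclusive scan gives the same result as one flat inclusive scan over all path tags. The leaf values and the WG chunk size are invented for illustration.

// Host-side sketch of shader/shared/bbox_monoid.wgsl. Field and flag names
// mirror the WGSL; the data in main() is made up.
#[derive(Clone, Copy, Debug, PartialEq)]
struct BboxMonoid {
    /// (x0, y0, x1, y1); a box with x1 <= x0 && y1 <= y0 counts as empty.
    bbox: [f32; 4],
    flags: u32,
}

const FLAG_RESET_BBOX: u32 = 1;
const FLAG_SET_BBOX: u32 = 2;

fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
    let mut bbox = b.bbox;
    // Merge a's bbox into b's only when b hasn't already seen a path
    // boundary (SET) and a doesn't end with one (RESET).
    if (b.flags & FLAG_SET_BBOX) == 0 && (a.flags & FLAG_RESET_BBOX) == 0 {
        if bbox[2] <= bbox[0] && bbox[3] <= bbox[1] {
            // b is empty: carry a's bbox through.
            bbox = a.bbox;
        } else if a.bbox[2] > a.bbox[0] || a.bbox[3] > a.bbox[1] {
            // Both non-empty: take the union.
            bbox = [
                a.bbox[0].min(bbox[0]),
                a.bbox[1].min(bbox[1]),
                a.bbox[2].max(bbox[2]),
                a.bbox[3].max(bbox[3]),
            ];
        }
    }
    // A RESET on the left side becomes SET on the combination, so later
    // combines can't leak an earlier path's bbox into this one.
    let flags = ((a.flags | (a.flags << 1)) & FLAG_SET_BBOX) | b.flags;
    BboxMonoid { bbox, flags }
}

fn main() {
    // Fake per-pathtag leaves: a segment bbox per tag, RESET on path-end tags.
    let leaf = |bbox: [f32; 4], reset: bool| BboxMonoid {
        bbox,
        flags: if reset { FLAG_RESET_BBOX } else { 0 },
    };
    let leaves = vec![
        leaf([0.0, 0.0, 1.0, 1.0], false),
        leaf([2.0, 0.0, 3.0, 2.0], true), // last segment of path 0
        leaf([5.0, 5.0, 6.0, 6.0], false),
        leaf([4.0, 7.0, 8.0, 9.0], false),
        leaf([0.0, 0.0, 0.0, 0.0], true), // last (empty) segment of path 1
    ];

    // Reference: flat inclusive scan over all leaves.
    let mut flat = Vec::new();
    let mut acc: Option<BboxMonoid> = None;
    for &m in &leaves {
        let next = match acc {
            Some(a) => combine_bbox_monoid(a, m),
            None => m,
        };
        flat.push(next);
        acc = Some(next);
    }

    // Two-level version: per-"workgroup" inclusive scan plus a running scan
    // of workgroup aggregates, mimicking pathseg (sh_bbox + bbox_reduced)
    // and bbox_fixup.
    const WG: usize = 2;
    let mut prefix: Option<BboxMonoid> = None; // scan over bbox_reduced
    for (w, chunk) in leaves.chunks(WG).enumerate() {
        let mut local: Option<BboxMonoid> = None;
        for (i, &m) in chunk.iter().enumerate() {
            let in_wg = match local {
                Some(a) => combine_bbox_monoid(a, m),
                None => m,
            };
            local = Some(in_wg);
            let full = match prefix {
                Some(p) => combine_bbox_monoid(p, in_wg),
                None => in_wg,
            };
            assert_eq!(full, flat[w * WG + i]);
        }
        prefix = Some(match prefix {
            Some(p) => combine_bbox_monoid(p, local.unwrap()),
            None => local.unwrap(),
        });
    }
    println!("two-level scan matches the flat scan");
}

The check passes because the combine operator is associative — the "semigroup" the comment in bbox_monoid.wgsl refers to — which is what lets bbox_fixup repair paths that straddle workgroup boundaries using only the reduced buffer and the tag monoids.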