Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checkpoint faster bbox computation #259

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"wgsl-analyzer.customImports": {
"bbox": "${workspaceFolder}/shader/shared/bbox.wgsl",
"bbox_monoid": "${workspaceFolder}/shader/shared/bbox_monoid.wgsl",
"blend": "${workspaceFolder}/shader/shared/blend.wgsl",
"bump": "${workspaceFolder}/shader/shared/bump.wgsl",
"clip": "${workspaceFolder}/shader/shared/clip.wgsl",
Expand Down
31 changes: 0 additions & 31 deletions shader/bbox_clear.wgsl

This file was deleted.

75 changes: 75 additions & 0 deletions shader/bbox_fixup.wgsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

#import config
#import pathtag
#import bbox
#import bbox_monoid

@group(0) @binding(0)
var<uniform> config: Config;

@group(0) @binding(1)
var<storage> tag_monoids: array<TagMonoid>;

@group(0) @binding(2)
var<storage> bbox_reduced: array<BboxMonoid>;

@group(0) @binding(3)
var<storage, read_write> path_bboxes: array<PathBbox>;

let WG_SIZE = 256u;
var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;

fn round_down(x: f32) -> i32 {
return i32(floor(x));
}

fn round_up(x: f32) -> i32 {
return i32(ceil(x));
}

// In the configuration with <= 64k pathtags, there's only one
// workgroup here, so the distinction between global and local is
// not meaningful. But we'll probably want to #ifdef a larger
// configuration, in which we also bind a doubly reduced buffer.
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
) {
var agg: BboxMonoid;
if global_id.x * WG_SIZE < config.n_pathtag {
agg = bbox_reduced[global_id.x];
}
sh_bbox[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
workgroupBarrier();
if local_id.x >= 1u << i {
let other = sh_bbox[local_id.x - (1u << i)];
agg = combine_bbox_monoid(other, agg);
}
workgroupBarrier();
sh_bbox[local_id.x] = agg;
}
// Explanation of this trick: we don't need to fix up first bbox.
// By offsetting the index, we can use the inclusive scan.
let ix = global_id.x + 1u;
if ix * WG_SIZE < config.n_pathtag {
// First path of the workgroup.
let path_ix = tag_monoids[ix * (WG_SIZE / 4u)].path_ix;
if (agg.flags & FLAG_RESET_BBOX) == 0u && (agg.bbox.z > agg.bbox.x || agg.bbox.w > agg.bbox.y) {
let out = &path_bboxes[path_ix];
// TODO: casting goes away
var bbox = vec4(f32((*out).x0), f32((*out).y0), f32((*out).x1), f32((*out).y1));
if bbox.z > bbox.x || bbox.w > bbox.y {
bbox = vec4(min(agg.bbox.xy, bbox.xy), max(agg.bbox.zw, bbox.zw));
} else {
bbox = agg.bbox;
}
(*out).x0 = round_down(bbox.x);
(*out).y0 = round_down(bbox.y);
(*out).x1 = round_up(bbox.z);
(*out).y1 = round_up(bbox.w);
}
}
}
87 changes: 35 additions & 52 deletions shader/pathseg.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
// There's some duplication of the decoding code but we won't worry about
// that just now. Perhaps it could be factored more nicely later.

#import bbox
#import bbox_monoid
#import config
#import pathtag
#import cubic
Expand All @@ -24,51 +26,14 @@ var<storage> scene: array<u32>;
@group(0) @binding(2)
var<storage> tag_monoids: array<TagMonoid>;

struct AtomicPathBbox {
x0: atomic<i32>,
y0: atomic<i32>,
x1: atomic<i32>,
y1: atomic<i32>,
linewidth: f32,
trans_ix: u32,
}

@group(0) @binding(3)
var<storage, read_write> path_bboxes: array<AtomicPathBbox>;

var<storage, read_write> path_bboxes: array<PathBbox>;

@group(0) @binding(4)
var<storage, read_write> cubics: array<Cubic>;

// Monoid is yagni, for future optimization

// struct BboxMonoid {
// bbox: vec4<f32>,
// flags: u32,
// }

// let FLAG_RESET_BBOX = 1u;
// let FLAG_SET_BBOX = 2u;

// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
// var c: BboxMonoid;
// c.bbox = b.bbox;
// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also
// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y {
// c.bbox = a.bbox;
// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u ||
// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)
// {
// c.bbox = vec4<f32>(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw));
// }
// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags;
// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u;
// return c;
// }

// fn bbox_monoid_identity() -> BboxMonoid {
// return BboxMonoid();
// }
@group(0) @binding(5)
var<storage, read_write> bbox_reduced: array<BboxMonoid>;

var<private> pathdata_base: u32;

Expand Down Expand Up @@ -115,10 +80,14 @@ fn round_up(x: f32) -> i32 {
return i32(ceil(x));
}

let WG_SIZE = 256u;
var<workgroup> sh_bbox: array<BboxMonoid, WG_SIZE>;

@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
let ix = global_id.x;
let tag_word = scene[config.pathtag_base + (ix >> 2u)];
Expand All @@ -130,12 +99,10 @@ fn main(

let out = &path_bboxes[tm.path_ix];
let linewidth = bitcast<f32>(scene[config.linewidth_base + tm.linewidth_ix]);
if (tag_byte & PATH_TAG_PATH) != 0u {
(*out).linewidth = linewidth;
(*out).trans_ix = tm.trans_ix;
}
let bbox_flags = u32((tag_byte & PATH_TAG_PATH) != 0u);
// Decode path data
let seg_type = tag_byte & PATH_TAG_SEG_TYPE;
var bbox: vec4<f32>;
if seg_type != 0u {
var p0: vec2<f32>;
var p1: vec2<f32>;
Expand Down Expand Up @@ -163,7 +130,7 @@ fn main(
let transform = read_transform(config.transform_base, tm.trans_ix);
p0 = transform_apply(transform, p0);
p1 = transform_apply(transform, p1);
var bbox = vec4(min(p0, p1), max(p0, p1));
bbox = vec4(min(p0, p1), max(p0, p1));
// Degree-raise
if seg_type == PATH_TAG_LINETO {
p3 = p1;
Expand Down Expand Up @@ -191,13 +158,29 @@ fn main(
}
let flags = u32(linewidth >= 0.0);
cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags);
// Update bounding box using atomics only. Computing a monoid is a
// potential future optimization.
if bbox.z > bbox.x || bbox.w > bbox.y {
atomicMin(&(*out).x0, round_down(bbox.x));
atomicMin(&(*out).y0, round_down(bbox.y));
atomicMax(&(*out).x1, round_up(bbox.z));
atomicMax(&(*out).y1, round_up(bbox.w));
}
var agg = BboxMonoid(bbox, bbox_flags);
sh_bbox[local_id.x] = agg;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i++) {
workgroupBarrier();
if local_id.x >= 1u << i {
let other = sh_bbox[local_id.x - (1u << i)];
agg = combine_bbox_monoid(other, agg);
}
workgroupBarrier();
sh_bbox[local_id.x] = agg;
}
if local_id.x == WG_SIZE - 1u {
bbox_reduced[wg_id.x] = agg;
}
if bbox_flags != 0u {
let out = &path_bboxes[tm.path_ix];
// TODO: now that we're not atomic, don't need fixed-point
(*out).x0 = round_down(agg.bbox.x);
(*out).y0 = round_down(agg.bbox.y);
(*out).x1 = round_up(agg.bbox.z);
(*out).y1 = round_up(agg.bbox.w);
(*out).linewidth = linewidth;
(*out).trans_ix = tm.trans_ix;
}
}
25 changes: 25 additions & 0 deletions shader/shared/bbox_monoid.wgsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

struct BboxMonoid {
bbox: vec4<f32>,
flags: u32,
}

let FLAG_RESET_BBOX = 1u;
let FLAG_SET_BBOX = 2u;

// Technically this is a semigroup with a left identity rather than a
// true monoid, but that is good enough for our purposes.
fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid {
var bbox = b.bbox;
if (b.flags & FLAG_SET_BBOX) == 0u && (a.flags & FLAG_RESET_BBOX) == 0u {
if bbox.z <= bbox.x && bbox.w <= bbox.y {
bbox = a.bbox;
} else if a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y {
bbox = vec4(min(a.bbox.xy, bbox.xy), max(a.bbox.zw, bbox.zw));
}
}
let flags = ((a.flags | (a.flags << 1u)) & FLAG_SET_BBOX) | b.flags;
return BboxMonoid(bbox, flags);
}

1 change: 1 addition & 0 deletions shader/shared/config.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ struct Config {
n_drawobj: u32,
n_path: u32,
n_clip: u32,
n_pathtag: u32,

// To reduce the number of bindings, info and bin data are combined
// into one buffer.
Expand Down
5 changes: 4 additions & 1 deletion src/encoding/packed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ pub struct Layout {
pub n_paths: u32,
/// Number of clips.
pub n_clips: u32,
/// Number of path tags.
pub n_pathtag: u32,
/// Start of binning data.
pub bin_data_start: u32,
/// Start of path tag stream.
Expand Down Expand Up @@ -139,13 +141,14 @@ impl PackedEncoding {
// Pack encoded data.
let layout = &mut self.layout;
*layout = Layout::default();
let n_path_tags = encoding.path_tags.len();
layout.n_paths = encoding.n_paths;
layout.n_draw_objects = encoding.n_paths;
layout.n_clips = encoding.n_clips;
layout.n_pathtag = n_path_tags as u32;
let data = &mut self.data;
data.clear();
// Path tag stream
let n_path_tags = encoding.path_tags.len();
let path_tag_padded = align_up(n_path_tags, 4 * shaders::PATHTAG_REDUCE_WG);
let capacity = path_tag_padded
+ slice_size_in_bytes(&encoding.path_data)
Expand Down
16 changes: 11 additions & 5 deletions src/render.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ const PATH_SIZE: u64 = 32;
const DRAW_BBOX_SIZE: u64 = 16;
const BUMP_SIZE: u64 = 16;
const BIN_HEADER_SIZE: u64 = 8;
const BBOX_MONOID_SIZE: u64 = 32;

// Note: this is defined here as it's still used by the reduced pipeline,
// but for the full pipeline, use the version in `encoding`.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
struct Config {
Expand All @@ -34,6 +37,7 @@ struct Config {
n_drawobj: u32,
n_path: u32,
n_clip: u32,
n_pathtag: u32,
bin_data_start: u32,
pathtag_base: u32,
pathdata_base: u32,
Expand Down Expand Up @@ -202,14 +206,10 @@ pub fn render_encoding_full(
);
let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG;
let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE);
recording.dispatch(
shaders.bbox_clear,
(drawobj_wgs, 1, 1),
[config_buf, path_bbox_buf],
);
let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE);
let path_coarse_wgs =
(n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG;
let bbox_reduced_buf = ResourceProxy::new_buf(path_coarse_wgs as u64 * BBOX_MONOID_SIZE);
recording.dispatch(
shaders.pathseg,
(path_coarse_wgs, 1, 1),
Expand All @@ -219,8 +219,14 @@ pub fn render_encoding_full(
tagmonoid_buf,
path_bbox_buf,
cubic_buf,
bbox_reduced_buf,
],
);
recording.dispatch(
shaders.bbox_fixup,
(1, 1, 1),
[config_buf, tagmonoid_buf, bbox_reduced_buf, path_bbox_buf],
);
let draw_reduced_buf = ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE);
recording.dispatch(
shaders.draw_reduce,
Expand Down
Loading