From 2dda2008fac33c47077644358476dc5e66fdba6e Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Mon, 13 May 2024 15:27:59 +0200 Subject: [PATCH 01/18] Stash robustness thoughts --- vello/src/lib.rs | 1 + vello/src/robust.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 vello/src/robust.rs diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 419b5ad93..85affba29 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -84,6 +84,7 @@ mod debug; mod recording; mod render; +mod robust; mod scene; mod shaders; #[cfg(feature = "wgpu")] diff --git a/vello/src/robust.rs b/vello/src/robust.rs new file mode 100644 index 000000000..05ed18b30 --- /dev/null +++ b/vello/src/robust.rs @@ -0,0 +1,12 @@ +//! A discussion of Vello's robust dynamic memory support +//! +//! When running the Vello pipeline, there are several buffers which: +//! 1) Need to be large enough to store +//! 2) Have a size which is non-trivial to calculate before running the pipeline +//! +//! When using wgpu (and most GPU apis), it is not possible for the GPU to synchronously +//! request a larger buffer, so we have to provide a best-effort buffer for this purpose. +//! +//! ## Handling failures +//! +//! If the buffer which was provided was too small, we have an issue. From 5e630b5a6e014a236c2a0f74d9d9da44dca7e753 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:55:19 +0100 Subject: [PATCH 02/18] Setup early exit in all stages --- vello/src/render.rs | 7 ++- vello/src/shaders.rs | 3 + vello/src/wgpu_engine.rs | 4 +- vello_encoding/src/config.rs | 5 ++ vello_shaders/shader/backdrop.wgsl | 3 + vello_shaders/shader/bbox_clear.wgsl | 3 + vello_shaders/shader/clip_leaf.wgsl | 3 + vello_shaders/shader/clip_reduce.wgsl | 1 + vello_shaders/shader/coarse.wgsl | 2 +- vello_shaders/shader/draw_leaf.wgsl | 12 ++-- vello_shaders/shader/draw_reduce.wgsl | 3 + vello_shaders/shader/flatten.wgsl | 12 ++-- vello_shaders/shader/path_count.wgsl | 1 + vello_shaders/shader/path_tiling.wgsl | 1 + vello_shaders/shader/pathtag_reduce.wgsl | 3 + vello_shaders/shader/pathtag_reduce2.wgsl | 1 + vello_shaders/shader/pathtag_scan.wgsl | 3 + vello_shaders/shader/pathtag_scan1.wgsl | 1 + vello_shaders/shader/prepare.wgsl | 70 +++++++++++++++++++++++ vello_shaders/shader/shared/bump.wgsl | 1 + vello_shaders/shader/shared/config.wgsl | 3 + vello_shaders/shader/tile_alloc.wgsl | 2 +- 22 files changed, 126 insertions(+), 18 deletions(-) create mode 100644 vello_shaders/shader/prepare.wgsl diff --git a/vello/src/render.rs b/vello/src/render.rs index 952ef498a..d6d2280dd 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -195,6 +195,10 @@ impl Render { buffer_sizes.path_reduced.size_in_bytes().into(), "reduced_buf", ); + let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); + recording.clear_all(bump_buf); + let bump_buf = ResourceProxy::Buffer(bump_buf); + recording.dispatch(shaders.prepare, (1, 1, 1), [config_buf, bump_buf]); // TODO: really only need pathtag_wgs - 1 recording.dispatch( shaders.pathtag_reduce, @@ -255,9 +259,6 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); - recording.clear_all(bump_buf); - let bump_buf = ResourceProxy::Buffer(bump_buf); let lines_buf = ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); recording.dispatch( 
diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index a58e0edfe..e0550b5a2 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -17,6 +17,7 @@ use crate::{ // Shaders for the full pipeline pub struct FullShaders { + pub prepare: ShaderId, pub pathtag_reduce: ShaderId, pub pathtag_reduce2: ShaderId, pub pathtag_scan1: ShaderId, @@ -101,6 +102,7 @@ pub(crate) fn full_shaders( }; } + let prepare = add_shader!(prepare, [Buffer, Buffer], CpuShaderType::Skipped); let pathtag_reduce = add_shader!(pathtag_reduce, [Uniform, BufReadOnly, Buffer]); let pathtag_reduce2 = add_shader!( pathtag_reduce2, @@ -249,6 +251,7 @@ pub(crate) fn full_shaders( }; Ok(FullShaders { + prepare, pathtag_reduce, pathtag_reduce2, pathtag_scan, diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index b10cbac5c..fa3e80b3d 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -409,7 +409,9 @@ impl WgpuEngine { transient_map .bufs .insert(buf_proxy.id, TransientBuf::Cpu(bytes)); - let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST; + // TODO: More principled way of working out usages + let usage = + BufferUsages::UNIFORM | BufferUsages::COPY_DST | BufferUsages::STORAGE; // Same consideration as above let buf = self .pool diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 88da7fd46..db5bb92c8 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -137,6 +137,10 @@ pub struct ConfigUniform { pub base_color: u32, /// Layout of packed scene data. pub layout: Layout, + /// Whether this stage has been cancelled at startup due to a predicted failure. + /// + /// Will be set by the `prepare` stage, and so should always be 0 on CPU. + pub cancelled: u32, /// Size of line soup buffer allocation (in [`LineSoup`]s) pub lines_size: u32, /// Size of binning buffer allocation (in `u32`s). 
@@ -178,6 +182,7 @@ impl RenderConfig { Self { gpu: ConfigUniform { width_in_tiles, + cancelled: false.into(), height_in_tiles, target_width: width, target_height: height, diff --git a/vello_shaders/shader/backdrop.wgsl b/vello_shaders/shader/backdrop.wgsl index acf97aa2f..78facafad 100644 --- a/vello_shaders/shader/backdrop.wgsl +++ b/vello_shaders/shader/backdrop.wgsl @@ -26,6 +26,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } let width_in_tiles = config.width_in_tiles; let ix = wg_id.x * width_in_tiles + local_id.x; var backdrop = 0; diff --git a/vello_shaders/shader/bbox_clear.wgsl b/vello_shaders/shader/bbox_clear.wgsl index 067d7c2dd..af1a0dc32 100644 --- a/vello_shaders/shader/bbox_clear.wgsl +++ b/vello_shaders/shader/bbox_clear.wgsl @@ -14,6 +14,9 @@ var path_bboxes: array; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; if ix < config.n_path { path_bboxes[ix].x0 = 0x7fffffff; diff --git a/vello_shaders/shader/clip_leaf.wgsl b/vello_shaders/shader/clip_leaf.wgsl index e947177d6..eb222a930 100644 --- a/vello_shaders/shader/clip_leaf.wgsl +++ b/vello_shaders/shader/clip_leaf.wgsl @@ -83,6 +83,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } var bic: Bic; if local_id.x < wg_id.x { bic = reduced[local_id.x]; diff --git a/vello_shaders/shader/clip_reduce.wgsl b/vello_shaders/shader/clip_reduce.wgsl index bdbc0e989..4ec28b72b 100644 --- a/vello_shaders/shader/clip_reduce.wgsl +++ b/vello_shaders/shader/clip_reduce.wgsl @@ -27,6 +27,7 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + // TODO: Cancel if this entire run has been cancelled? let inp = clip_inp[global_id.x].path_ix; let is_push = inp >= 0; var bic = Bic(1u - u32(is_push), u32(is_push)); diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index 6856396b4..ba0edfa8f 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -155,7 +155,7 @@ fn main( // We need to check only prior stages, as if this stage has failed in another workgroup, // we still want to know this workgroup's memory requirement. 
if local_id.x == 0u { - var failed = atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_FLATTEN); + var failed = atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_FLATTEN | PREVIOUS_RUN); if atomicLoad(&bump.seg_counts) > config.seg_counts_size { failed |= STAGE_PATH_COUNT; } diff --git a/vello_shaders/shader/draw_leaf.wgsl b/vello_shaders/shader/draw_leaf.wgsl index 2ef76db80..2b87798d3 100644 --- a/vello_shaders/shader/draw_leaf.wgsl +++ b/vello_shaders/shader/draw_leaf.wgsl @@ -54,6 +54,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } // Reduce prefix of workgroups up to this one var agg = draw_monoid_identity(); if local_id.x < wg_id.x { @@ -108,10 +111,7 @@ fn main( } let dd = config.drawdata_base + m.scene_offset; let di = m.info_offset; - if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT || - tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || - tag_word == DRAWTAG_FILL_IMAGE || tag_word == DRAWTAG_BEGIN_CLIP - { + if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE || tag_word == DRAWTAG_BEGIN_CLIP { let bbox = path_bbox[m.path_ix]; // TODO: bbox is mostly yagni here, sort that out. Maybe clips? // let x0 = f32(bbox.x0); @@ -121,9 +121,7 @@ fn main( // let bbox_f = vec4(x0, y0, x1, y1); var transform = Transform(); let draw_flags = bbox.draw_flags; - if tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || - tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE - { + if tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE { transform = read_transform(config.transform_base, bbox.trans_ix); } switch tag_word { diff --git a/vello_shaders/shader/draw_reduce.wgsl b/vello_shaders/shader/draw_reduce.wgsl index 7a12b8188..5acaf6a18 100644 --- a/vello_shaders/shader/draw_reduce.wgsl +++ b/vello_shaders/shader/draw_reduce.wgsl @@ -24,6 +24,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } let num_blocks_total = (config.n_drawobj + (WG_SIZE - 1u)) / WG_SIZE; // When the number of blocks exceeds the workgroup size, divide // the work evenly so each workgroup handles n_blocks / wg, with diff --git a/vello_shaders/shader/flatten.wgsl b/vello_shaders/shader/flatten.wgsl index 1de07910f..a4de16d90 100644 --- a/vello_shaders/shader/flatten.wgsl +++ b/vello_shaders/shader/flatten.wgsl @@ -359,8 +359,7 @@ fn flatten_euler( transform = local_to_device; let mat = transform.mat; - scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + - length(vec2(mat.x - mat.w, mat.y + mat.z)); + scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + length(vec2(mat.x - mat.w, mat.y + mat.z)); } // Drop zero length lines. 
This is an exact equality test because dropping very short @@ -811,6 +810,9 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; pathdata_base = config.pathdata_base; bbox = vec4(1e31, 1e31, -1e31, -1e31); @@ -848,7 +850,7 @@ fn main( let offset_tangent = offset * normalize(tangent); let n = offset_tangent.yx * vec2f(-1., 1.); draw_cap(path_ix, (style_flags & STYLE_FLAGS_START_CAP_MASK) >> 2u, - pts.p0, pts.p0 - n, pts.p0 + n, -offset_tangent, transform); + pts.p0, pts.p0 - n, pts.p0 + n, -offset_tangent, transform); } else { // Don't draw anything if the path is closed. } @@ -878,11 +880,11 @@ fn main( if neighbor.do_join { draw_join(path_ix, style_flags, pts.p3, tan_prev, tan_next, - n_prev, n_next, transform); + n_prev, n_next, transform); } else { // Draw end cap. draw_cap(path_ix, (style_flags & STYLE_FLAGS_END_CAP_MASK), - pts.p3, pts.p3 + n_prev, pts.p3 - n_prev, offset_tangent, transform); + pts.p3, pts.p3 + n_prev, pts.p3 - n_prev, offset_tangent, transform); } } } else { diff --git a/vello_shaders/shader/path_count.wgsl b/vello_shaders/shader/path_count.wgsl index 7de89278d..a9e7c6171 100644 --- a/vello_shaders/shader/path_count.wgsl +++ b/vello_shaders/shader/path_count.wgsl @@ -52,6 +52,7 @@ let ROBUST_EPSILON: f32 = 2e-7; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + // If the pipeline is cancelled, `path_count_setup` will not allocate any threads let n_lines = atomicLoad(&bump.lines); var count = 0u; if global_id.x < n_lines { diff --git a/vello_shaders/shader/path_tiling.wgsl b/vello_shaders/shader/path_tiling.wgsl index 63f982b87..eb659f8f3 100644 --- a/vello_shaders/shader/path_tiling.wgsl +++ b/vello_shaders/shader/path_tiling.wgsl @@ -40,6 +40,7 @@ let ROBUST_EPSILON: f32 = 2e-7; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + // If the pipeline is cancelled, `path_tiling_setup` will not allocate any threads let n_segments = atomicLoad(&bump.seg_counts); if global_id.x < n_segments { let seg_count = seg_counts[global_id.x]; diff --git a/vello_shaders/shader/pathtag_reduce.wgsl b/vello_shaders/shader/pathtag_reduce.wgsl index ab6e10e98..f591e4d61 100644 --- a/vello_shaders/shader/pathtag_reduce.wgsl +++ b/vello_shaders/shader/pathtag_reduce.wgsl @@ -23,6 +23,9 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; let tag_word = scene[config.pathtag_base + ix]; var agg = reduce_tag(tag_word); diff --git a/vello_shaders/shader/pathtag_reduce2.wgsl b/vello_shaders/shader/pathtag_reduce2.wgsl index eb8621f0f..f2ce907de 100644 --- a/vello_shaders/shader/pathtag_reduce2.wgsl +++ b/vello_shaders/shader/pathtag_reduce2.wgsl @@ -23,6 +23,7 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + // TODO: Cancel if needed? 
let ix = global_id.x; var agg = reduced_in[ix]; sh_scratch[local_id.x] = agg; diff --git a/vello_shaders/shader/pathtag_scan.wgsl b/vello_shaders/shader/pathtag_scan.wgsl index 27a34bdf7..b460afc3e 100644 --- a/vello_shaders/shader/pathtag_scan.wgsl +++ b/vello_shaders/shader/pathtag_scan.wgsl @@ -31,6 +31,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } #ifdef small var agg = tag_monoid_identity(); if local_id.x < wg_id.x { diff --git a/vello_shaders/shader/pathtag_scan1.wgsl b/vello_shaders/shader/pathtag_scan1.wgsl index 7f3b47659..10610f7c8 100644 --- a/vello_shaders/shader/pathtag_scan1.wgsl +++ b/vello_shaders/shader/pathtag_scan1.wgsl @@ -29,6 +29,7 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + // TODO: Cancel if needed var agg = tag_monoid_identity(); if local_id.x < wg_id.x { agg = reduced2[local_id.x]; diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl new file mode 100644 index 000000000..ed6cdca39 --- /dev/null +++ b/vello_shaders/shader/prepare.wgsl @@ -0,0 +1,70 @@ +// Copyright 2022 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Determine whether the Vello pipeline is likely to fail during this run +// and therefore whether all later stages should be cancelled. + +#import config +#import bump + +@group(0) @binding(0) +var config: Config; + +@group(0) @binding(1) +// TODO: Use a non-atomic version of BumpAllocators? +var bump: BumpAllocators; + +@compute @workgroup_size(1) +fn main() { + var should_cancel = false; + let previous_failure = atomicLoad(&bump.failed); + if previous_failure == PREVIOUS_RUN { + // Don't early-exit from multiple frames in a row + // The CPU should be blocking on the frame which failed anyway, so this + // case should never be reached, but if the CPU side isn't doing that + // properly, we can try again. 
+ // (Note that this check is simply an early-exit for this case, as all the + // bump values would have been reset to 0 anyway) + atomicStore(&bump.failed, 0u); + } else if previous_failure != 0u { + // If the previous frame failed (for another reason) + + // And we don't have enough memory to have run that previous frame + if config.lines_size < atomicLoad(&bump.lines) { + should_cancel = true; + } + if config.binning_size < atomicLoad(&bump.binning) { + should_cancel = true; + } + if config.ptcl_size < atomicLoad(&bump.ptcl) { + should_cancel = true; + } + if config.tiles_size < atomicLoad(&bump.tile) { + should_cancel = true; + } + if config.seg_counts_size < atomicLoad(&bump.seg_counts) { + should_cancel = true; + } + if config.segments_size < atomicLoad(&bump.segments) { + should_cancel = true; + } + if config.lines_size < atomicLoad(&bump.lines) { + should_cancel = true; + } + // config.blend_size < atomicLoad(&bump.blend) + if should_cancel { + // Then don't run this frame + config.cancelled = 1u; + atomicStore(&bump.failed, PREVIOUS_RUN); + } else { + atomicStore(&bump.failed, 0u); + } + } + atomicStore(&bump.binning, 0u); + atomicStore(&bump.ptcl, 0u); + atomicStore(&bump.tile, 0u); + atomicStore(&bump.seg_counts, 0u); + atomicStore(&bump.segments, 0u); + atomicStore(&bump.blend, 0u); + atomicStore(&bump.lines, 0u); +} diff --git a/vello_shaders/shader/shared/bump.wgsl b/vello_shaders/shader/shared/bump.wgsl index 9270fc2f8..54763d94f 100644 --- a/vello_shaders/shader/shared/bump.wgsl +++ b/vello_shaders/shader/shared/bump.wgsl @@ -7,6 +7,7 @@ let STAGE_TILE_ALLOC: u32 = 0x2u; let STAGE_FLATTEN: u32 = 0x4u; let STAGE_PATH_COUNT: u32 = 0x8u; let STAGE_COARSE: u32 = 0x10u; +let PREVIOUS_RUN: u32 = 0x20u; // This must be kept in sync with the struct in config.rs in the encoding crate. struct BumpAllocators { diff --git a/vello_shaders/shader/shared/config.wgsl b/vello_shaders/shader/shared/config.wgsl index 3391afd9a..95d68cc7c 100644 --- a/vello_shaders/shader/shared/config.wgsl +++ b/vello_shaders/shader/shared/config.wgsl @@ -32,6 +32,9 @@ struct Config { transform_base: u32, style_base: u32, + // Whether this stage has been cancelled at startup. 0 means uncancelled. + cancelled: u32, + // Sizes of bump allocated buffers (in element size units) lines_size: u32, binning_size: u32, diff --git a/vello_shaders/shader/tile_alloc.wgsl b/vello_shaders/shader/tile_alloc.wgsl index c6073d128..2f32f8668 100644 --- a/vello_shaders/shader/tile_alloc.wgsl +++ b/vello_shaders/shader/tile_alloc.wgsl @@ -41,7 +41,7 @@ fn main( // We need to check only prior stages, as if this stage has failed in another workgroup, // we still want to know this workgroup's memory requirement. 
if local_id.x == 0u { - let failed = (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_FLATTEN)) != 0u; + let failed = (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_FLATTEN | PREVIOUS_RUN)) != 0u; sh_previous_failed = u32(failed); } let failed = workgroupUniformLoad(&sh_previous_failed); From 4309b593f42e5852672e0682fd34bf1a4510f574 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:56:23 +0100 Subject: [PATCH 03/18] Give slightly better error output `rust-analyzer` will eagerly eat the error output otherwise --- vello_shaders/build.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vello_shaders/build.rs b/vello_shaders/build.rs index 79abde655..57930b26a 100644 --- a/vello_shaders/build.rs +++ b/vello_shaders/build.rs @@ -24,7 +24,10 @@ fn main() { let mut shaders = match compile::ShaderInfo::from_default() { Ok(s) => s, Err(err) => { - eprintln!("{err}"); + let formatted = err.to_string(); + for line in formatted.lines() { + println!("cargo:warning={line}"); + } return; } }; From 123f60d5f710049a975032169b9f3c32ec2d8c2d Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 15:24:07 +0100 Subject: [PATCH 04/18] Fix CPU handling and remove empty file --- vello/src/lib.rs | 41 +++++++++++++++++++++++++------ vello/src/render.rs | 8 +++--- vello/src/robust.rs | 12 --------- vello/src/shaders.rs | 2 +- vello_shaders/shader/prepare.wgsl | 1 + vello_shaders/src/cpu.rs | 2 ++ vello_shaders/src/cpu/prepare.rs | 23 +++++++++++++++++ 7 files changed, 65 insertions(+), 24 deletions(-) delete mode 100644 vello/src/robust.rs create mode 100644 vello_shaders/src/cpu/prepare.rs diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 85affba29..6d57a6de0 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -84,7 +84,6 @@ mod debug; mod recording; mod render; -mod robust; mod scene; mod shaders; #[cfg(feature = "wgpu")] @@ -123,6 +122,7 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; +use wgpu::{Buffer, BufferUsages}; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -252,6 +252,7 @@ pub struct Renderer { blit: Option, #[cfg(feature = "debug_layers")] debug: Option, + bump: Option, target: Option, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, @@ -351,6 +352,7 @@ impl Renderer { #[cfg(feature = "debug_layers")] debug, target: None, + bump: None, // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -388,17 +390,42 @@ impl Renderer { texture: &TextureView, params: &RenderParams, ) -> Result<()> { - let (recording, target) = + let (mut recording, target, bump_buf) = render::render_full(scene, &mut self.resolver, &self.shaders, params); - let external_resources = [ExternalResource::Image( - *target.as_image().unwrap(), - texture, - )]; + let cpu_external; + let gpu_external; + let gpu_bump; + let external_resources: &[ExternalResource] = if self.options.use_cpu { + // HACK: Our handling of buffers across CPU and GPU is not great + + // We don't retrain the bump buffer if we're using CPU shaders + // This is because some of stages might still be running on the + // GPU, and we can't easily get the bump buffer back to the CPU + // from the GPU + recording.free_buffer(bump_buf); + cpu_external = [ExternalResource::Image(target, texture)]; + &cpu_external + } else { + gpu_bump = self.bump.get_or_insert_with(|| { + 
device.create_buffer(&wgpu::BufferDescriptor { + label: Some("bump"), + size: bump_buf.size, + usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC, + mapped_at_creation: false, + }) + }); + gpu_external = [ + ExternalResource::Image(target, texture), + ExternalResource::Buffer(bump_buf, &gpu_bump), + ]; + &gpu_external + }; + self.engine.run_recording( device, queue, &recording, - &external_resources, + external_resources, "render_to_texture", #[cfg(feature = "wgpu-profiler")] &mut self.profiler, diff --git a/vello/src/render.rs b/vello/src/render.rs index d6d2280dd..1ee235de8 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -82,7 +82,7 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, -) -> (Recording, ResourceProxy) { +) -> (Recording, ImageProxy, BufferProxy) { render_encoding_full(scene.encoding(), resolver, shaders, params) } @@ -96,12 +96,13 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, -) -> (Recording, ResourceProxy) { +) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = render.render_encoding_coarse(encoding, resolver, shaders, params, false); let out_image = render.out_image(); + let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); - (recording, out_image.into()) + (recording, out_image, bump_buf) } impl Default for Render { @@ -196,7 +197,6 @@ impl Render { "reduced_buf", ); let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); - recording.clear_all(bump_buf); let bump_buf = ResourceProxy::Buffer(bump_buf); recording.dispatch(shaders.prepare, (1, 1, 1), [config_buf, bump_buf]); // TODO: really only need pathtag_wgs - 1 diff --git a/vello/src/robust.rs b/vello/src/robust.rs deleted file mode 100644 index 05ed18b30..000000000 --- a/vello/src/robust.rs +++ /dev/null @@ -1,12 +0,0 @@ -//! A discussion of Vello's robust dynamic memory support -//! -//! When running the Vello pipeline, there are several buffers which: -//! 1) Need to be large enough to store -//! 2) Have a size which is non-trivial to calculate before running the pipeline -//! -//! When using wgpu (and most GPU apis), it is not possible for the GPU to synchronously -//! request a larger buffer, so we have to provide a best-effort buffer for this purpose. -//! -//! ## Handling failures -//! -//! If the buffer which was provided was too small, we have an issue. diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index e0550b5a2..1124888ef 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -102,7 +102,7 @@ pub(crate) fn full_shaders( }; } - let prepare = add_shader!(prepare, [Buffer, Buffer], CpuShaderType::Skipped); + let prepare = add_shader!(prepare, [Buffer, Buffer]); let pathtag_reduce = add_shader!(pathtag_reduce, [Uniform, BufReadOnly, Buffer]); let pathtag_reduce2 = add_shader!( pathtag_reduce2, diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index ed6cdca39..93ed8c447 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -3,6 +3,7 @@ // Determine whether the Vello pipeline is likely to fail during this run // and therefore whether all later stages should be cancelled. 
+// This enables reduced latency in most failure cases #import config #import bump diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index a129fce59..2266b283c 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -29,6 +29,7 @@ mod pathtag_reduce; mod pathtag_scan; mod tile_alloc; mod util; +mod prepare; pub use backdrop::backdrop; pub use bbox_clear::bbox_clear; @@ -46,6 +47,7 @@ pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; pub use pathtag_scan::pathtag_scan; pub use tile_alloc::tile_alloc; +pub use prepare::prepare; use std::cell::{Ref, RefCell, RefMut}; use std::ops::{Deref, DerefMut}; diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs new file mode 100644 index 000000000..81750b0ac --- /dev/null +++ b/vello_shaders/src/cpu/prepare.rs @@ -0,0 +1,23 @@ +use vello_encoding::{BumpAllocators, ConfigUniform}; + +use super::CpuBinding; + +fn prepare_main(_config: &ConfigUniform, bump: &mut BumpAllocators) { + // We don't yet do robust bump handling in the CPU shaders, so do the minimal version for this shader + bump.binning = 0; + bump.ptcl = 0; + bump.tile = 0; + bump.seg_counts = 0; + bump.segments = 0; + bump.blend = 0; + bump.lines = 0; + bump.failed = 0; +} + +pub fn prepare(_n_wg: u32, resources: &[CpuBinding]) { + // On the GPU, config is mutable, but our CPU runner doesn't allow accessing uploaded buffers as mutable + // This is a hack + let config = resources[0].as_typed(); + let mut bump = resources[1].as_typed_mut(); + prepare_main(&config, &mut bump); +} From dd506d569919f7f84b82eda54142398467ee5b6a Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:40:42 +0100 Subject: [PATCH 05/18] Get the hackiest version of robust memory working --- vello/src/lib.rs | 123 ++++++++++++++++++++++++++++++++--- vello/src/render.rs | 48 ++++++++++---- vello/src/wgpu_engine.rs | 10 +++ vello_encoding/src/config.rs | 97 +++++++++++++++++---------- vello_encoding/src/lib.rs | 4 +- vello_shaders/src/cpu.rs | 4 +- 6 files changed, 223 insertions(+), 63 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 6d57a6de0..d9a0c0c06 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -89,6 +89,10 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; #[cfg(feature = "wgpu")] use std::{num::NonZeroUsize, sync::Arc}; @@ -102,6 +106,7 @@ pub use skrifa; pub mod glyph; +use vello_encoding::{BufferSize, BumpBufferSizes}; #[cfg(feature = "wgpu")] pub use wgpu; @@ -122,7 +127,7 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; -use wgpu::{Buffer, BufferUsages}; +use wgpu::SubmissionIndex; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -130,7 +135,7 @@ pub use debug::DebugLayers; /// Temporary export, used in `with_winit` for stats pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] -use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView}; +use wgpu::{Buffer, BufferUsages, Device, Queue, SurfaceTexture, TextureFormat, TextureView}; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -241,6 +246,8 @@ pub enum Error { #[allow(dead_code)] // this can be unused when wgpu feature is not used pub(crate) type Result = std::result::Result; +type BumpSubmission = (SubmissionIndex, Buffer, Arc); + /// Renders a scene into a 
texture or surface. #[cfg(feature = "wgpu")] pub struct Renderer { @@ -252,8 +259,12 @@ pub struct Renderer { blit: Option, #[cfg(feature = "debug_layers")] debug: Option, - bump: Option, target: Option, + // Fields for robust dynamic memory + bump: Option, + previous_submission: Option, + previouser_submission: Option, + bump_sizes: BumpBufferSizes, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, #[cfg(feature = "wgpu-profiler")] @@ -353,6 +364,9 @@ impl Renderer { debug, target: None, bump: None, + previous_submission: None, + previouser_submission: None, + bump_sizes: Default::default(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -389,9 +403,14 @@ impl Renderer { scene: &Scene, texture: &TextureView, params: &RenderParams, - ) -> Result<()> { - let (mut recording, target, bump_buf) = - render::render_full(scene, &mut self.resolver, &self.shaders, params); + ) -> Result> { + let (mut recording, target, bump_buf) = render::render_full( + scene, + &mut self.resolver, + &self.shaders, + params, + self.bump_sizes, + ); let cpu_external; let gpu_external; let gpu_bump; @@ -416,7 +435,7 @@ impl Renderer { }); gpu_external = [ ExternalResource::Image(target, texture), - ExternalResource::Buffer(bump_buf, &gpu_bump), + ExternalResource::Buffer(bump_buf, gpu_bump), ]; &gpu_external }; @@ -430,7 +449,8 @@ impl Renderer { #[cfg(feature = "wgpu-profiler")] &mut self.profiler, )?; - Ok(()) + let bump_download = self.engine.take_download(bump_buf); + Ok(bump_download) } /// Renders a scene to the target surface. @@ -449,6 +469,67 @@ impl Renderer { surface: &SurfaceTexture, params: &RenderParams, ) -> Result<()> { + let buffer_completed = if let Some((idx, bump, completed)) = + self.previouser_submission.take() + { + // Ensure that we have the bump buffer from the rendering two frames ago + // The previous frame will have been cancelled if that is the case + + // Warning: Blocks! + device.poll(wgpu::MaintainBase::WaitForSubmissionIndex(idx)); + + if completed.swap(false, Ordering::Acquire) { + { + let slice = &bump.slice(..); + let data = slice.get_mapped_range(); + let data: BumpAllocators = bytemuck::pod_read_unaligned(&*data); + if data.failed != 0 { + if data.failed == 0x20 { + eprintln!( + "Run failed but next run will be retried, reallocated in last run" + ); + } else { + eprintln!("Previous run failed, need to reallocate"); + // TODO: Be smarter here, e.g. notice that we're over by a certain factor + // and bump several buffers? + + // TODO: Also reduce allocation sizes + // TODO: Free buffers which haven't been used in "a while" + + // TODO: This ignore the draw tag length? + if data.binning > self.bump_sizes.bin_data.len() { + self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); + } + if data.lines > self.bump_sizes.lines.len() { + self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); + } + // if data.blend > self.bump_sizes.? 
// TODO + if data.ptcl > self.bump_sizes.ptcl.len() { + self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 5 / 4); + } + if data.seg_counts > self.bump_sizes.seg_counts.len() { + self.bump_sizes.seg_counts = + BufferSize::new(data.seg_counts * 5 / 4); + } + if data.tile > self.bump_sizes.tiles.len() { + self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); + } + if data.segments > self.bump_sizes.segments.len() { + self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); + } + } + } + } + bump.unmap(); + // TODO: Return `bump` into the engine's pool + } else { + // Downloading the buffer failed; we just assume that we can keep going? + } + completed + } else { + Arc::new(AtomicBool::new(false)) + }; + let width = params.width; let height = params.height; let mut target = self @@ -460,7 +541,7 @@ impl Renderer { if target.width != width || target.height != height { target = TargetTexture::new(device, width, height); } - self.render_to_texture(device, queue, scene, &target.view, params)?; + let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; let blit = self .blit .as_ref() @@ -497,7 +578,28 @@ impl Renderer { "blit (render_to_surface)", #[cfg(feature = "wgpu-profiler")] &mut self.profiler, - )?; + ); + if let Some(download) = &bump_download { + let completed = buffer_completed.clone(); + download + .slice(..) + .map_async(wgpu::MapMode::Read, move |res| match res { + Ok(()) => { + completed.store(true, Ordering::Release); + } + Err(e) => { + log::warn!("Failed to map bump buffer: {e}"); + } + }); + }; + let idx = queue.submit(Some(encoder.finish())); + if let Some(download) = bump_download { + self.previouser_submission = + self.previous_submission + .replace((idx, download, buffer_completed)); + } else { + self.previouser_submission = self.previous_submission.take(); + } self.target = Some(target); #[cfg(feature = "wgpu-profiler")] { @@ -606,6 +708,7 @@ impl Renderer { &mut self.resolver, &self.shaders, params, + Default::default(), robust, ); let target = render.out_image(); diff --git a/vello/src/render.rs b/vello/src/render.rs index 1ee235de8..4918946ec 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -13,7 +13,9 @@ use crate::{AaConfig, RenderParams}; #[cfg(feature = "wgpu")] use crate::Scene; -use vello_encoding::{make_mask_lut, make_mask_lut_16, Encoding, Resolver, WorkgroupSize}; +use vello_encoding::{ + make_mask_lut, make_mask_lut_16, BumpBufferSizes, Encoding, Resolver, WorkgroupSize, +}; /// State for a render in progress. 
pub struct Render { @@ -82,8 +84,9 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, ) -> (Recording, ImageProxy, BufferProxy) { - render_encoding_full(scene.encoding(), resolver, shaders, params) + render_encoding_full(scene.encoding(), resolver, shaders, params, bump_sizes) } #[cfg(feature = "wgpu")] @@ -96,9 +99,11 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); - let mut recording = render.render_encoding_coarse(encoding, resolver, shaders, params, false); + let mut recording = + render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, true); let out_image = render.out_image(); let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); @@ -132,6 +137,7 @@ impl Render { resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, robust: bool, ) -> Recording { use vello_encoding::RenderConfig; @@ -167,8 +173,13 @@ impl Render { for image in images.images { recording.write_image(image_atlas, image.1, image.2, image.0.clone()); } - let cpu_config = - RenderConfig::new(&layout, params.width, params.height, ¶ms.base_color); + let cpu_config = RenderConfig::new( + &layout, + params.width, + params.height, + ¶ms.base_color, + bump_sizes, + ); let buffer_sizes = &cpu_config.buffer_sizes; let wg_counts = &cpu_config.workgroup_counts; @@ -184,14 +195,21 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bin_data.size_in_bytes() as u64, + buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64, "info_bin_data_buf", ); - let tile_buf = - ResourceProxy::new_buf(buffer_sizes.tiles.size_in_bytes().into(), "tile_buf"); - let segments_buf = - ResourceProxy::new_buf(buffer_sizes.segments.size_in_bytes().into(), "segments_buf"); - let ptcl_buf = ResourceProxy::new_buf(buffer_sizes.ptcl.size_in_bytes().into(), "ptcl_buf"); + let tile_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.tiles.size_in_bytes().into(), + "tile_buf", + ); + let segments_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.segments.size_in_bytes().into(), + "segments_buf", + ); + let ptcl_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.ptcl.size_in_bytes().into(), + "ptcl_buf", + ); let reduced_buf = ResourceProxy::new_buf( buffer_sizes.path_reduced.size_in_bytes().into(), "reduced_buf", @@ -259,8 +277,10 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let lines_buf = - ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); + let lines_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.lines.size_in_bytes().into(), + "lines_buf", + ); recording.dispatch( shaders.flatten, wg_counts.flatten, @@ -391,7 +411,7 @@ impl Render { [bump_buf, indirect_count_buf.into()], ); let seg_counts_buf = ResourceProxy::new_buf( - buffer_sizes.seg_counts.size_in_bytes().into(), + buffer_sizes.bump_buffers.seg_counts.size_in_bytes().into(), "seg_counts_buf", ); recording.dispatch_indirect( diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index fa3e80b3d..adb2ec8ed 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -685,6 +685,12 @@ impl WgpuEngine { let src_buf = self .bind_map .get_gpu_buf(proxy.id) + 
.or_else(|| { + transient_map.bufs.get(&proxy.id).and_then(|it| match it { + TransientBuf::Cpu(_) => None, + TransientBuf::Gpu(buf) => Some(*buf), + }) + }) .ok_or(Error::UnavailableBufferUsed(proxy.name, "download"))?; let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST; let buf = self.pool.get_buf(proxy.size, "download", usage, device); @@ -748,6 +754,10 @@ impl WgpuEngine { self.downloads.get(&buf.id) } + pub fn take_download(&mut self, buf: BufferProxy) -> Option { + self.downloads.remove(&buf.id) + } + pub fn free_download(&mut self, buf: BufferProxy) { self.downloads.remove(&buf.id); } diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index db5bb92c8..24e0aae3c 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -170,7 +170,13 @@ pub struct RenderConfig { } impl RenderConfig { - pub fn new(layout: &Layout, width: u32, height: u32, base_color: &peniko::Color) -> Self { + pub fn new( + layout: &Layout, + width: u32, + height: u32, + base_color: &peniko::Color, + bump_buffers: BumpBufferSizes, + ) -> Self { let new_width = width.next_multiple_of(TILE_WIDTH); let new_height = height.next_multiple_of(TILE_HEIGHT); let width_in_tiles = new_width / TILE_WIDTH; @@ -178,7 +184,7 @@ impl RenderConfig { let n_path_tags = layout.path_tags_size(); let workgroup_counts = WorkgroupCounts::new(layout, width_in_tiles, height_in_tiles, n_path_tags); - let buffer_sizes = BufferSizes::new(layout, &workgroup_counts); + let buffer_sizes = BufferSizes::new(layout, &workgroup_counts, bump_buffers); Self { gpu: ConfigUniform { width_in_tiles, @@ -187,13 +193,13 @@ impl RenderConfig { target_width: width, target_height: height, base_color: base_color.to_premul_u32(), - lines_size: buffer_sizes.lines.len(), - binning_size: buffer_sizes.bin_data.len() - layout.bin_data_start, - tiles_size: buffer_sizes.tiles.len(), - seg_counts_size: buffer_sizes.seg_counts.len(), - segments_size: buffer_sizes.segments.len(), - blend_size: buffer_sizes.blend_spill.len(), - ptcl_size: buffer_sizes.ptcl.len(), + lines_size: buffer_sizes.bump_buffers.lines.len(), + binning_size: buffer_sizes.bump_buffers.bin_data.len() - layout.bin_data_start, + tiles_size: buffer_sizes.bump_buffers.tiles.len(), + seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), + segments_size: buffer_sizes.bump_buffers.segments.len(), + blend_size: buffer_sizes.bump_buffers.blend_spill.len(), + ptcl_size: buffer_sizes.bump_buffers.ptcl.len(), layout: *layout, }, workgroup_counts, @@ -333,6 +339,16 @@ impl PartialOrd for BufferSize { self.len.partial_cmp(&other.len) } } +#[derive(Copy, Clone, Debug)] +pub struct BumpBufferSizes { + pub lines: BufferSize, + pub bin_data: BufferSize, + pub tiles: BufferSize, + pub seg_counts: BufferSize, + pub segments: BufferSize, + pub ptcl: BufferSize, + pub blend_spill: BufferSize, +} /// Computed sizes for all buffers. #[derive(Copy, Clone, Debug, Default)] @@ -356,17 +372,46 @@ pub struct BufferSizes { pub bin_headers: BufferSize, pub paths: BufferSize, // Bump allocated buffers - pub lines: BufferSize, - pub bin_data: BufferSize, - pub tiles: BufferSize, - pub seg_counts: BufferSize, - pub segments: BufferSize, - pub blend_spill: BufferSize, - pub ptcl: BufferSize, + pub bump_buffers: BumpBufferSizes, +} + +impl BumpBufferSizes { + fn new() -> Self { + // The following buffer sizes have been hand picked to accommodate the vello test scenes as + // well as paris-30k. These should instead get derived from the scene layout using + // reasonable heuristics. 
+ let bin_data = BufferSize::new(1 << 18); + let tiles = BufferSize::new(1 << 21); + let lines = BufferSize::new(1 << 21); + let seg_counts = BufferSize::new(1 << 21); + let segments = BufferSize::new(1 << 21); + let ptcl = BufferSize::new(1 << 23); + // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. + let blend_spill = BufferSize::new(1 << 20); + BumpBufferSizes { + bin_data, + lines, + ptcl, + seg_counts, + segments, + tiles, + blend_spill, + } + } +} + +impl Default for BumpBufferSizes { + fn default() -> Self { + Self::new() + } } impl BufferSizes { - pub fn new(layout: &Layout, workgroups: &WorkgroupCounts) -> Self { + pub fn new( + layout: &Layout, + workgroups: &WorkgroupCounts, + bump_buffers: BumpBufferSizes, + ) -> Self { let n_paths = layout.n_paths; let n_draw_objects = layout.n_draw_objects; let n_clips = layout.n_clips; @@ -397,17 +442,6 @@ impl BufferSizes { let n_paths_aligned = align_up(n_paths, 256); let paths = BufferSize::new(n_paths_aligned); - // The following buffer sizes have been hand picked to accommodate the vello test scenes as - // well as paris-30k. These should instead get derived from the scene layout using - // reasonable heuristics. - let bin_data = BufferSize::new(1 << 18); - let tiles = BufferSize::new(1 << 21); - let lines = BufferSize::new(1 << 21); - let seg_counts = BufferSize::new(1 << 21); - let segments = BufferSize::new(1 << 21); - // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. - let blend_spill = BufferSize::new(1 << 20); - let ptcl = BufferSize::new(1 << 23); Self { path_reduced, path_reduced2, @@ -424,15 +458,8 @@ impl BufferSizes { draw_bboxes, bump_alloc, indirect_count, - lines, bin_headers, paths, - bin_data, - tiles, - seg_counts, - segments, - blend_spill, - ptcl, } } } diff --git a/vello_encoding/src/lib.rs b/vello_encoding/src/lib.rs index 30db95000..8e1109308 100644 --- a/vello_encoding/src/lib.rs +++ b/vello_encoding/src/lib.rs @@ -27,8 +27,8 @@ mod resolve; pub use binning::BinHeader; pub use clip::{Clip, ClipBbox, ClipBic, ClipElement}; pub use config::{ - BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, ConfigUniform, IndirectCount, - RenderConfig, WorkgroupCounts, WorkgroupSize, + BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, BumpBufferSizes, ConfigUniform, + IndirectCount, RenderConfig, WorkgroupCounts, WorkgroupSize, }; pub use draw::{ DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid, diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 2266b283c..30ca367f3 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -27,9 +27,9 @@ mod path_tiling; mod path_tiling_setup; mod pathtag_reduce; mod pathtag_scan; +mod prepare; mod tile_alloc; mod util; -mod prepare; pub use backdrop::backdrop; pub use bbox_clear::bbox_clear; @@ -46,8 +46,8 @@ pub use path_tiling::path_tiling; pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; pub use pathtag_scan::pathtag_scan; -pub use tile_alloc::tile_alloc; pub use prepare::prepare; +pub use tile_alloc::tile_alloc; use std::cell::{Ref, RefCell, RefMut}; use std::ops::{Deref, DerefMut}; From f07391de6a52c99b97dc390fdf8e42f63c37b1d8 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:43:27 +0100 Subject: [PATCH 06/18] Fix CPU runner and clippy --- vello/src/lib.rs | 2 +- vello/src/render.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/vello/src/lib.rs b/vello/src/lib.rs index d9a0c0c06..9e6097100 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -482,7 +482,7 @@ impl Renderer { { let slice = &bump.slice(..); let data = slice.get_mapped_range(); - let data: BumpAllocators = bytemuck::pod_read_unaligned(&*data); + let data: BumpAllocators = bytemuck::pod_read_unaligned(&data); if data.failed != 0 { if data.failed == 0x20 { eprintln!( diff --git a/vello/src/render.rs b/vello/src/render.rs index 4918946ec..dd9ed7d77 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -490,7 +490,8 @@ impl Render { image_atlas: ResourceProxy::Image(image_atlas), out_image, }); - if robust { + // TODO: This second check is a massive hack + if robust && !shaders.pathtag_is_cpu { recording.download(*bump_buf.as_buf().unwrap()); } recording.free_resource(bump_buf); From 0fd3097c1ef7129a7aebf408a9ca789143432d66 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:57:15 +0100 Subject: [PATCH 07/18] Fixup some broken behaviour and add comments --- examples/scenes/src/mmark.rs | 3 ++- vello/src/lib.rs | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/scenes/src/mmark.rs b/examples/scenes/src/mmark.rs index 3deea8d79..d05a4d7bb 100644 --- a/examples/scenes/src/mmark.rs +++ b/examples/scenes/src/mmark.rs @@ -74,7 +74,8 @@ impl TestScene for MMark { let n = if c < 10 { (c + 1) * 1000 } else { - ((c - 8) * 10000).min(120_000) + // The 190_000 scene can't be uploaded due to wgpu's limits + ((c - 8) * 10000).min(180_000) }; self.resize(n); let mut rng = rand::thread_rng(); diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 9e6097100..2c459d6cd 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -489,7 +489,10 @@ impl Renderer { "Run failed but next run will be retried, reallocated in last run" ); } else { - eprintln!("Previous run failed, need to reallocate"); + eprintln!( + "Previous run failed, need to reallocate: {:x?}", + data.failed + ); // TODO: Be smarter here, e.g. notice that we're over by a certain factor // and bump several buffers? @@ -505,7 +508,8 @@ impl Renderer { } // if data.blend > self.bump_sizes.? 
// TODO if data.ptcl > self.bump_sizes.ptcl.len() { - self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 5 / 4); + // TODO: At 5/4, this doesn't work very well + self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); } if data.seg_counts > self.bump_sizes.seg_counts.len() { self.bump_sizes.seg_counts = From fe2d6b9a83182d920ef27ce175f79310dc06299c Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:42:34 +0100 Subject: [PATCH 08/18] Push some debug logging and a few misc improvements --- vello/src/lib.rs | 55 +++++++++++++++++++++++++++---- vello/src/render.rs | 10 +++++- vello_encoding/src/config.rs | 14 ++++---- vello_shaders/shader/coarse.wgsl | 5 +-- vello_shaders/shader/prepare.wgsl | 5 +-- 5 files changed, 69 insertions(+), 20 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 2c459d6cd..c8d787a4f 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -484,43 +484,86 @@ impl Renderer { let data = slice.get_mapped_range(); let data: BumpAllocators = bytemuck::pod_read_unaligned(&data); if data.failed != 0 { - if data.failed == 0x20 { - eprintln!( + if data.failed & 0x20 != 0 { + log::debug!( "Run failed but next run will be retried, reallocated in last run" ); } else { - eprintln!( - "Previous run failed, need to reallocate: {:x?}", - data.failed - ); + // log::info!( + // "Previous run failed, need to reallocate: {:x?}", + // data.failed + // ); + // log::debug!("{:?}", data); // TODO: Be smarter here, e.g. notice that we're over by a certain factor // and bump several buffers? + let mut changed = false; // TODO: Also reduce allocation sizes // TODO: Free buffers which haven't been used in "a while" // TODO: This ignore the draw tag length? if data.binning > self.bump_sizes.bin_data.len() { + changed = true; + log::debug!( + "Resizing binning from {:?} to {:?}", + self.bump_sizes.bin_data, + data.binning + ); self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); } if data.lines > self.bump_sizes.lines.len() { + changed = true; + log::debug!( + "Resizing lines from {:?} to {:?}", + self.bump_sizes.lines, + data.lines + ); self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); } // if data.blend > self.bump_sizes.? // TODO if data.ptcl > self.bump_sizes.ptcl.len() { + changed = true; + log::debug!( + "Resizing ptcl from {:?} to {:?}", + self.bump_sizes.ptcl, + data.ptcl + ); // TODO: At 5/4, this doesn't work very well self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); } if data.seg_counts > self.bump_sizes.seg_counts.len() { + changed = true; + log::debug!( + "Resizing seg_counts from {:?} to {:?}", + self.bump_sizes.seg_counts, + data.seg_counts + ); self.bump_sizes.seg_counts = BufferSize::new(data.seg_counts * 5 / 4); } if data.tile > self.bump_sizes.tiles.len() { + changed = true; + log::debug!( + "Resizing tiles from {:?} to {:?}", + self.bump_sizes.tiles, + data.tile + ); self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); } if data.segments > self.bump_sizes.segments.len() { + changed = true; + log::debug!( + "Resizing segments from {:?} to {:?}", + self.bump_sizes.segments, + data.segments + ); self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); } + if !changed { + log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); + } else { + log::info!("Detected need for reallocation, and did reallocate {:x?}. 
Data {data:?}", data.failed); + } } } } diff --git a/vello/src/render.rs b/vello/src/render.rs index dd9ed7d77..8c441df5c 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -180,6 +180,13 @@ impl Render { ¶ms.base_color, bump_sizes, ); + // log::debug!("Config: {{ lines_size: {:?}, binning_size: {:?}, tiles_size: {:?}, seg_counts_size: {:?}, segments_size: {:?}, ptcl_size: {:?} }}", + // cpu_config.gpu.lines_size, + // cpu_config.gpu.binning_size, + // cpu_config.gpu.tiles_size, + // cpu_config.gpu.seg_counts_size, + // cpu_config.gpu.segments_size, + // cpu_config.gpu.ptcl_size); let buffer_sizes = &cpu_config.buffer_sizes; let wg_counts = &cpu_config.workgroup_counts; @@ -195,7 +202,8 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64, + buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64 + + (layout.bin_data_start as u64) * std::mem::size_of::() as u64, "info_bin_data_buf", ); let tile_buf = ResourceProxy::new_buf( diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 24e0aae3c..68ef34d9c 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -194,7 +194,7 @@ impl RenderConfig { target_height: height, base_color: base_color.to_premul_u32(), lines_size: buffer_sizes.bump_buffers.lines.len(), - binning_size: buffer_sizes.bump_buffers.bin_data.len() - layout.bin_data_start, + binning_size: buffer_sizes.bump_buffers.bin_data.len(), tiles_size: buffer_sizes.bump_buffers.tiles.len(), seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), segments_size: buffer_sizes.bump_buffers.segments.len(), @@ -380,12 +380,12 @@ impl BumpBufferSizes { // The following buffer sizes have been hand picked to accommodate the vello test scenes as // well as paris-30k. These should instead get derived from the scene layout using // reasonable heuristics. - let bin_data = BufferSize::new(1 << 18); - let tiles = BufferSize::new(1 << 21); - let lines = BufferSize::new(1 << 21); - let seg_counts = BufferSize::new(1 << 21); - let segments = BufferSize::new(1 << 21); - let ptcl = BufferSize::new(1 << 23); + let bin_data = BufferSize::new(1 << 12); + let tiles = BufferSize::new(1 << 15); + let lines = BufferSize::new(1 << 15); + let seg_counts = BufferSize::new(1 << 15); + let segments = BufferSize::new(1 << 15); + let ptcl = BufferSize::new(1 << 17); // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. let blend_spill = BufferSize::new(1 << 20); BumpBufferSizes { diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index ba0edfa8f..19e4e240f 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -68,10 +68,7 @@ var cmd_limit: u32; // Make sure there is space for a command of given size, plus a jump if needed fn alloc_cmd(size: u32) { if cmd_offset + size >= cmd_limit { - // We might be able to save a little bit of computation here - // by setting the initial value of the bump allocator. - let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; - var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT); + var new_cmd = atomicAdd(&bump.ptcl, PTCL_INCREMENT); if new_cmd + PTCL_INCREMENT > config.ptcl_size { // This sets us up for technical UB, as lots of threads will be writing // to the same locations. 
But I think it's fine, and predicating the diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 93ed8c447..412113288 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -7,6 +7,7 @@ #import config #import bump +#import ptcl @group(0) @binding(0) var config: Config; @@ -19,7 +20,7 @@ var bump: BumpAllocators; fn main() { var should_cancel = false; let previous_failure = atomicLoad(&bump.failed); - if previous_failure == PREVIOUS_RUN { + if (previous_failure & PREVIOUS_RUN) != 0 { // Don't early-exit from multiple frames in a row // The CPU should be blocking on the frame which failed anyway, so this // case should never be reached, but if the CPU side isn't doing that @@ -62,7 +63,7 @@ fn main() { } } atomicStore(&bump.binning, 0u); - atomicStore(&bump.ptcl, 0u); + atomicStore(&bump.ptcl, config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC); atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); From cc010176ee499201f751ca03885381f30691d898 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:58:14 +0100 Subject: [PATCH 09/18] Fixup handling to not be busted --- examples/scenes/src/mmark.rs | 2 +- vello/src/lib.rs | 75 +++++++++------ vello/src/render.rs | 15 ++- vello/src/wgpu_engine.rs | 10 ++ vello_encoding/src/config.rs | 123 ++++++++++++------------ vello_encoding/src/lib.rs | 4 +- vello_encoding/src/resolve.rs | 2 + vello_shaders/shader/coarse.wgsl | 10 +- vello_shaders/shader/prepare.wgsl | 9 +- vello_shaders/shader/shared/config.wgsl | 3 + vello_shaders/src/cpu.rs | 2 +- vello_shaders/src/cpu/coarse.rs | 4 +- 12 files changed, 144 insertions(+), 115 deletions(-) diff --git a/examples/scenes/src/mmark.rs b/examples/scenes/src/mmark.rs index d05a4d7bb..042484bbb 100644 --- a/examples/scenes/src/mmark.rs +++ b/examples/scenes/src/mmark.rs @@ -75,7 +75,7 @@ impl TestScene for MMark { (c + 1) * 1000 } else { // The 190_000 scene can't be uploaded due to wgpu's limits - ((c - 8) * 10000).min(180_000) + ((c - 8) * 10000).min(190_000) }; self.resize(n); let mut rng = rand::thread_rng(); diff --git a/vello/src/lib.rs b/vello/src/lib.rs index c8d787a4f..3215bfd74 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -106,7 +106,6 @@ pub use skrifa; pub mod glyph; -use vello_encoding::{BufferSize, BumpBufferSizes}; #[cfg(feature = "wgpu")] pub use wgpu; @@ -264,7 +263,7 @@ pub struct Renderer { bump: Option, previous_submission: Option, previouser_submission: Option, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, #[cfg(feature = "wgpu-profiler")] @@ -366,7 +365,7 @@ impl Renderer { bump: None, previous_submission: None, previouser_submission: None, - bump_sizes: Default::default(), + bump_sizes: BumpAllocators::default(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -500,64 +499,76 @@ impl Renderer { let mut changed = false; // TODO: Also reduce allocation sizes // TODO: Free buffers which haven't been used in "a while" + // TODO: We should have awareness of the maximum binding size supported by the device + // That's easy for all buffers but lines and ptcl - // TODO: This ignore the draw tag length? 
- if data.binning > self.bump_sizes.bin_data.len() { + if data.binning > self.bump_sizes.binning { changed = true; + let new_size = data.binning * 5 / 4; log::debug!( - "Resizing binning from {:?} to {:?}", - self.bump_sizes.bin_data, - data.binning + "Resizing binning to {:?} (Needed {:?}, had {:?})", + new_size, + data.binning, + self.bump_sizes.binning, ); - self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); + self.bump_sizes.binning = new_size; } - if data.lines > self.bump_sizes.lines.len() { + if data.lines > self.bump_sizes.lines { changed = true; + let new_size = data.lines * 5 / 4; log::debug!( - "Resizing lines from {:?} to {:?}", + "Resizing lines to {:?} (Needed {:?}, had {:?})", + new_size, + data.lines, self.bump_sizes.lines, - data.lines ); - self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); + self.bump_sizes.lines = new_size; } // if data.blend > self.bump_sizes.? // TODO - if data.ptcl > self.bump_sizes.ptcl.len() { + if data.ptcl > self.bump_sizes.ptcl { changed = true; + // TODO: At 5/4, this doesn't work very well + let new_size = data.ptcl * 5 / 4; log::debug!( - "Resizing ptcl from {:?} to {:?}", + "Resizing ptcl to {:?} (Needed {:?}, had {:?})", + new_size, + data.ptcl, self.bump_sizes.ptcl, - data.ptcl ); - // TODO: At 5/4, this doesn't work very well - self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); + self.bump_sizes.ptcl = new_size; } - if data.seg_counts > self.bump_sizes.seg_counts.len() { + if data.seg_counts > self.bump_sizes.seg_counts { changed = true; + let new_size = data.seg_counts * 5 / 4; log::debug!( - "Resizing seg_counts from {:?} to {:?}", + "Resizing seg_counts to {:?} (Needed {:?}, had {:?})", + new_size, + data.seg_counts, self.bump_sizes.seg_counts, - data.seg_counts ); - self.bump_sizes.seg_counts = - BufferSize::new(data.seg_counts * 5 / 4); + self.bump_sizes.seg_counts = new_size; } - if data.tile > self.bump_sizes.tiles.len() { + if data.tile > self.bump_sizes.tile { changed = true; + let new_size = data.tile * 5 / 4; log::debug!( - "Resizing tiles from {:?} to {:?}", - self.bump_sizes.tiles, - data.tile + "Resizing tile to {:?} (Needed {:?}, had {:?})", + new_size, + data.tile, + self.bump_sizes.tile, ); - self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); + self.bump_sizes.tile = new_size; } - if data.segments > self.bump_sizes.segments.len() { + if data.segments > self.bump_sizes.segments { changed = true; + let new_size = data.segments * 5 / 4; log::debug!( - "Resizing segments from {:?} to {:?}", + "Resizing segments to {:?} (Needed {:?}, had {:?})", + new_size, + data.segments, self.bump_sizes.segments, - data.segments ); - self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); + self.bump_sizes.segments = new_size; } if !changed { log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); diff --git a/vello/src/render.rs b/vello/src/render.rs index 8c441df5c..c45ba6296 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -14,7 +14,7 @@ use crate::{AaConfig, RenderParams}; use crate::Scene; use vello_encoding::{ - make_mask_lut, make_mask_lut_16, BumpBufferSizes, Encoding, Resolver, WorkgroupSize, + make_mask_lut, make_mask_lut_16, BumpAllocators, Encoding, Resolver, WorkgroupSize, }; /// State for a render in progress. 
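The reallocation branch above applies the same rule to every bump counter. A minimal sketch of that rule, assuming a hypothetical free-standing helper (`grow` is not a function in this series; the patch itself updates each `self.bump_sizes` field inline, and `log` is already a dependency used by the surrounding code):

    // Only grow, never shrink, and leave ~25% headroom (the 5/4 factor used above)
    // over the requirement observed in the downloaded BumpAllocators counters.
    fn grow(current: &mut u32, needed: u32, label: &str) -> bool {
        if needed <= *current {
            return false;
        }
        let new_size = needed * 5 / 4;
        log::debug!("Resizing {label} to {new_size} (Needed {needed}, had {current})");
        *current = new_size;
        true
    }

Calling such a helper once per counter mirrors the `changed` flag in the code above: if any counter exceeded its buffer, the sizes are enlarged before the next frame's config is built.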
@@ -84,7 +84,7 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, ) -> (Recording, ImageProxy, BufferProxy) { render_encoding_full(scene.encoding(), resolver, shaders, params, bump_sizes) } @@ -99,7 +99,7 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = @@ -137,7 +137,7 @@ impl Render { resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, robust: bool, ) -> Recording { use vello_encoding::RenderConfig; @@ -174,7 +174,7 @@ impl Render { recording.write_image(image_atlas, image.1, image.2, image.0.clone()); } let cpu_config = RenderConfig::new( - &layout, + layout, params.width, params.height, ¶ms.base_color, @@ -202,12 +202,11 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64 - + (layout.bin_data_start as u64) * std::mem::size_of::() as u64, + buffer_sizes.bump_buffers.binning.size_in_bytes() as u64, "info_bin_data_buf", ); let tile_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.tiles.size_in_bytes().into(), + buffer_sizes.bump_buffers.tile.size_in_bytes().into(), "tile_buf", ); let segments_buf = ResourceProxy::new_buf( diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index adb2ec8ed..d53044540 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -952,6 +952,16 @@ impl ResourcePool { device: &Device, ) -> Buffer { let rounded_size = Self::size_class(size, SIZE_CLASS_BITS); + // let max_storage_buffer_binding_size = + // device.limits().max_storage_buffer_binding_size.into(); + // if rounded_size > max_storage_buffer_binding_size { + // if size < max_storage_buffer_binding_size { + // log::warn!("Would allocate buffer {name} to be larger than {max_storage_buffer_binding_size}. Clamped"); + // rounded_size = max_storage_buffer_binding_size; + // } else { + // log::warn!("Would allocate buffer {name} to be larger than {max_storage_buffer_binding_size}, which is not allowed"); + // } + // } let props = BufferProperties { size: rounded_size, usages: usage, diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 68ef34d9c..6aeca0d46 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -14,6 +14,7 @@ const TILE_WIDTH: u32 = 16; const TILE_HEIGHT: u32 = 16; // TODO: Obtain these from the vello_shaders crate +pub const PTCL_INITIAL_ALLOC: u32 = 64; pub(crate) const PATH_REDUCE_WG: u32 = 256; const PATH_BBOX_WG: u32 = 256; const FLATTEN_WG: u32 = 256; @@ -22,6 +23,9 @@ const CLIP_REDUCE_WG: u32 = 256; /// Counters for tracking dynamic allocation on the GPU. /// /// This must be kept in sync with the struct in `shader/shared/bump.wgsl` +/// +/// These values do *not* include any pre-allocated sections which use the same +/// underlying buffers. 
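+///
+/// For example, `binning` here counts only the dynamically bumped bin data; the
+/// shared info/bin-data buffer additionally needs `layout.bin_data_start`
+/// elements in front of it, which is what `BumpAllocators::memory` below adds
+/// back in when computing buffer sizes.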
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] #[repr(C)] pub struct BumpAllocators { @@ -37,9 +41,8 @@ pub struct BumpAllocators { pub lines: u32, } -#[derive(Default)] +#[derive(Default, Copy, Clone, Debug)] pub struct BumpAllocatorMemory { - pub total: u32, pub binning: BufferSize, pub ptcl: BufferSize, pub tile: BufferSize, @@ -49,20 +52,14 @@ pub struct BumpAllocatorMemory { } impl BumpAllocators { - pub fn memory(&self) -> BumpAllocatorMemory { - let binning = BufferSize::new(self.binning); - let ptcl = BufferSize::new(self.ptcl); + pub fn memory(&self, layout: &Layout) -> BumpAllocatorMemory { + let binning = BufferSize::new(self.binning + layout.bin_data_start); + let ptcl = BufferSize::new(self.ptcl + layout.ptcl_dyn_start); let tile = BufferSize::new(self.tile); let seg_counts = BufferSize::new(self.seg_counts); let segments = BufferSize::new(self.segments); let lines = BufferSize::new(self.lines); BumpAllocatorMemory { - total: binning.size_in_bytes() - + ptcl.size_in_bytes() - + tile.size_in_bytes() - + seg_counts.size_in_bytes() - + segments.size_in_bytes() - + lines.size_in_bytes(), binning, ptcl, tile, @@ -73,8 +70,26 @@ impl BumpAllocators { } } +impl BumpAllocatorMemory { + /// Get the total memory used by the bump buffers + pub fn total(&self) -> u64 { + [ + self.binning.size_in_bytes(), + self.ptcl.size_in_bytes(), + self.tile.size_in_bytes(), + self.seg_counts.size_in_bytes(), + self.segments.size_in_bytes(), + self.lines.size_in_bytes(), + ] + .into_iter() + .map(u64::from) + .sum() + } +} + impl std::fmt::Display for BumpAllocatorMemory { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let total = self.total(); write!( f, "\n \ @@ -85,9 +100,9 @@ impl std::fmt::Display for BumpAllocatorMemory { \tSegment Counts:\t\t{} elements ({} bytes)\n\ \tSegments:\t\t{} elements ({} bytes)\n\ \tLines:\t\t\t{} elements ({} bytes)", - self.total, - self.total as f32 / (1 << 10) as f32, - self.total as f32 / (1 << 20) as f32, + total, + total as f32 / (1 << 10) as f32, + total as f32 / (1 << 20) as f32, self.binning.len(), self.binning.size_in_bytes(), self.ptcl.len(), @@ -138,6 +153,7 @@ pub struct ConfigUniform { /// Layout of packed scene data. pub layout: Layout, /// Whether this stage has been cancelled at startup due to a predicted + /// memory allocation failure. /// /// Will be set by the `prepare` stage, and so should always be 0 on CPU. 
pub cancelled: u32, @@ -171,20 +187,21 @@ pub struct RenderConfig { impl RenderConfig { pub fn new( - layout: &Layout, + mut layout: Layout, width: u32, height: u32, base_color: &peniko::Color, - bump_buffers: BumpBufferSizes, + bump_buffers: BumpAllocators, ) -> Self { let new_width = width.next_multiple_of(TILE_WIDTH); let new_height = height.next_multiple_of(TILE_HEIGHT); let width_in_tiles = new_width / TILE_WIDTH; let height_in_tiles = new_height / TILE_HEIGHT; + layout.ptcl_dyn_start = width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; let n_path_tags = layout.path_tags_size(); let workgroup_counts = - WorkgroupCounts::new(layout, width_in_tiles, height_in_tiles, n_path_tags); - let buffer_sizes = BufferSizes::new(layout, &workgroup_counts, bump_buffers); + WorkgroupCounts::new(&layout, width_in_tiles, height_in_tiles, n_path_tags); + let buffer_sizes = BufferSizes::new(&layout, &workgroup_counts, bump_buffers); Self { gpu: ConfigUniform { width_in_tiles, @@ -193,14 +210,14 @@ impl RenderConfig { target_width: width, target_height: height, base_color: base_color.to_premul_u32(), - lines_size: buffer_sizes.bump_buffers.lines.len(), - binning_size: buffer_sizes.bump_buffers.bin_data.len(), - tiles_size: buffer_sizes.bump_buffers.tiles.len(), - seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), - segments_size: buffer_sizes.bump_buffers.segments.len(), - blend_size: buffer_sizes.bump_buffers.blend_spill.len(), - ptcl_size: buffer_sizes.bump_buffers.ptcl.len(), - layout: *layout, + lines_size: bump_buffers.lines, + binning_size: bump_buffers.binning, + tiles_size: bump_buffers.tile, + seg_counts_size: bump_buffers.seg_counts, + segments_size: bump_buffers.segments, + blend_size: buffer_sizes.bump_buffers.blend_spill, + ptcl_size: bump_buffers.ptcl, + layout, }, workgroup_counts, buffer_sizes, @@ -339,17 +356,6 @@ impl PartialOrd for BufferSize { self.len.partial_cmp(&other.len) } } -#[derive(Copy, Clone, Debug)] -pub struct BumpBufferSizes { - pub lines: BufferSize, - pub bin_data: BufferSize, - pub tiles: BufferSize, - pub seg_counts: BufferSize, - pub segments: BufferSize, - pub ptcl: BufferSize, - pub blend_spill: BufferSize, -} - /// Computed sizes for all buffers. #[derive(Copy, Clone, Debug, Default)] pub struct BufferSizes { @@ -372,45 +378,41 @@ pub struct BufferSizes { pub bin_headers: BufferSize, pub paths: BufferSize, // Bump allocated buffers - pub bump_buffers: BumpBufferSizes, + pub bump_buffers: BumpAllocatorMemory, } -impl BumpBufferSizes { - fn new() -> Self { - // The following buffer sizes have been hand picked to accommodate the vello test scenes as - // well as paris-30k. These should instead get derived from the scene layout using - // reasonable heuristics. - let bin_data = BufferSize::new(1 << 12); - let tiles = BufferSize::new(1 << 15); - let lines = BufferSize::new(1 << 15); - let seg_counts = BufferSize::new(1 << 15); - let segments = BufferSize::new(1 << 15); - let ptcl = BufferSize::new(1 << 17); +impl BumpAllocators { + /// The initial sizes which should be used for the bump buffers. + pub fn initial_sizes() -> Self { + // The following buffer sizes have been picked to accommodate small scenes + // and which will grow as needed. + let binning = 1 << 12; + let tile = 1 << 15; + let lines = 1 << 15; + let seg_counts = 1 << 15; + let segments = 1 << 15; + let ptcl = 1 << 17; // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. 
- let blend_spill = BufferSize::new(1 << 20); - BumpBufferSizes { - bin_data, + let blend_spill = 1 << 20; + BumpAllocators { + binning, lines, ptcl, seg_counts, segments, - tiles, + tile, blend_spill, + blend: 0, + failed: 0, } } } -impl Default for BumpBufferSizes { - fn default() -> Self { - Self::new() - } -} - impl BufferSizes { pub fn new( layout: &Layout, workgroups: &WorkgroupCounts, - bump_buffers: BumpBufferSizes, + bump_buffers: BumpAllocators, ) -> Self { let n_paths = layout.n_paths; let n_draw_objects = layout.n_draw_objects; @@ -441,6 +443,7 @@ impl BufferSizes { let bin_headers = BufferSize::new(binning_wgs * 256); let n_paths_aligned = align_up(n_paths, 256); let paths = BufferSize::new(n_paths_aligned); + let bump_buffers = bump_buffers.memory(layout); Self { path_reduced, diff --git a/vello_encoding/src/lib.rs b/vello_encoding/src/lib.rs index 8e1109308..4c1831191 100644 --- a/vello_encoding/src/lib.rs +++ b/vello_encoding/src/lib.rs @@ -27,8 +27,8 @@ mod resolve; pub use binning::BinHeader; pub use clip::{Clip, ClipBbox, ClipBic, ClipElement}; pub use config::{ - BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, BumpBufferSizes, ConfigUniform, - IndirectCount, RenderConfig, WorkgroupCounts, WorkgroupSize, + BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, ConfigUniform, IndirectCount, + RenderConfig, WorkgroupCounts, WorkgroupSize, PTCL_INITIAL_ALLOC, }; pub use draw::{ DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid, diff --git a/vello_encoding/src/resolve.rs b/vello_encoding/src/resolve.rs index 626e79c84..e437c86c9 100644 --- a/vello_encoding/src/resolve.rs +++ b/vello_encoding/src/resolve.rs @@ -29,6 +29,8 @@ pub struct Layout { pub n_clips: u32, /// Start of binning data. pub bin_data_start: u32, + /// Start of dynamically allocated ptcl commands + pub ptcl_dyn_start: u32, /// Start of path tag stream. pub path_tag_base: u32, /// Start of path data stream. diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index 19e4e240f..a829405f9 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -76,6 +76,7 @@ fn alloc_cmd(size: u32) { new_cmd = 0u; atomicOr(&bump.failed, STAGE_COARSE); } + new_cmd += config.ptcl_dyn_start; ptcl[cmd_offset] = CMD_JUMP; ptcl[cmd_offset + 1u] = new_cmd; cmd_offset = new_cmd; @@ -90,6 +91,11 @@ fn write_path(tile: Tile, tile_ix: u32, draw_flags: u32) { let n_segs = tile.segment_count_or_ix; if n_segs != 0u { var seg_ix = atomicAdd(&bump.segments, n_segs); + if seg_ix > config.segments_size { + // All writes into segments happen in path_tiling, so we don't need + // to avoid UB in this shader + atomicOr(&bump.failed, STAGE_COARSE); + } tiles[tile_ix].segment_count_or_ix = ~seg_ix; alloc_cmd(4u); ptcl[cmd_offset] = CMD_FILL; @@ -157,7 +163,7 @@ fn main( failed |= STAGE_PATH_COUNT; } // Reuse sh_part_count to hold failed flag, shmem is tight - sh_part_count[0] = u32(failed); + sh_part_count[0] = failed; } let failed = workgroupUniformLoad(&sh_part_count[0]); if failed != 0u { @@ -406,7 +412,7 @@ fn main( case DRAWTAG_END_CLIP: { clip_depth -= 1u; // A clip shape is always a non-zero fill (draw_flags=0). 
- write_path(tile, tile_ix, /*draw_flags=*/0u); + write_path(tile, tile_ix, /* draw_flags= */ 0u); let blend = scene[dd]; let alpha = bitcast<f32>(scene[dd + 1u]); write_end_clip(CmdEndClip(blend, alpha)); diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 412113288..01006afd6 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -1,4 +1,4 @@ -// Copyright 2022 the Vello Authors +// Copyright 2024 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense // Determine whether the Vello pipeline is likely to fail during this run @@ -20,7 +20,7 @@ var<storage, read_write> bump: BumpAllocators; fn main() { var should_cancel = false; let previous_failure = atomicLoad(&bump.failed); - if (previous_failure & PREVIOUS_RUN) != 0 { + if (previous_failure & PREVIOUS_RUN) != 0u { // Don't early-exit from multiple frames in a row // The CPU should be blocking on the frame which failed anyway, so this // case should never be reached, but if the CPU side isn't doing that @@ -32,9 +32,6 @@ fn main() { // If the previous frame failed (for another reason) // And we don't have enough memory to have run that previous frame - if config.lines_size < atomicLoad(&bump.lines) { - should_cancel = true; - } if config.binning_size < atomicLoad(&bump.binning) { should_cancel = true; } @@ -63,7 +60,7 @@ fn main() { } } atomicStore(&bump.binning, 0u); - atomicStore(&bump.ptcl, config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC); + atomicStore(&bump.ptcl, 0u); atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); diff --git a/vello_shaders/shader/shared/config.wgsl b/vello_shaders/shader/shared/config.wgsl index 95d68cc7c..c3e9ceed4 100644 --- a/vello_shaders/shader/shared/config.wgsl +++ b/vello_shaders/shader/shared/config.wgsl @@ -21,6 +21,9 @@ struct Config { // To reduce the number of bindings, info and bin data are combined // into one buffer. bin_data_start: u32, + // Each tile gets a pre-allocated area of ptcl; the dynamically allocated + // section is after this.
+ ptcl_dyn_start: u32, // offsets within scene buffer (in u32 units) pathtag_base: u32, diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 30ca367f3..e0ecff483 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -169,7 +169,7 @@ pub struct CpuTexture { // Common internal definitions -const PTCL_INITIAL_ALLOC: u32 = 64; +pub use vello_encoding::PTCL_INITIAL_ALLOC; // Tags for PTCL commands const CMD_END: u32 = 0; diff --git a/vello_shaders/src/cpu/coarse.rs b/vello_shaders/src/cpu/coarse.rs index 88ec6036d..c28907ea3 100644 --- a/vello_shaders/src/cpu/coarse.rs +++ b/vello_shaders/src/cpu/coarse.rs @@ -52,10 +52,8 @@ impl TileState { ptcl: &mut [u32], ) { if self.cmd_offset + size >= self.cmd_limit { - let ptcl_dyn_start = - config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM); - let new_cmd = ptcl_dyn_start + bump.ptcl; + let new_cmd = config.layout.ptcl_dyn_start + bump.ptcl; bump.ptcl += chunk_size; ptcl[self.cmd_offset as usize] = CMD_JUMP; ptcl[self.cmd_offset as usize + 1] = new_cmd; From 1928122d77b997ff90a96f416c0e5f390a0787f0 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:06:37 +0100 Subject: [PATCH 10/18] Move subset to a helper function --- examples/with_winit/src/stats.rs | 2 +- vello/src/lib.rs | 189 ++++++++++++++------------ vello/src/render.rs | 2 +- vello_encoding/src/config.rs | 10 +- vello_shaders/shader/shared/bump.wgsl | 2 +- vello_shaders/src/cpu/coarse.rs | 4 +- vello_shaders/src/cpu/prepare.rs | 2 +- 7 files changed, 113 insertions(+), 98 deletions(-) diff --git a/examples/with_winit/src/stats.rs b/examples/with_winit/src/stats.rs index 3b5339dee..0524322f3 100644 --- a/examples/with_winit/src/stats.rs +++ b/examples/with_winit/src/stats.rs @@ -75,7 +75,7 @@ impl Snapshot { labels.push(format!("ptcl: {}", bump.ptcl)); labels.push(format!("tile: {}", bump.tile)); labels.push(format!("segments: {}", bump.segments)); - labels.push(format!("blend: {}", bump.blend)); + labels.push(format!("blend: {}", bump.blend_spill)); } // height / 2 is dedicated to the text labels and the rest is filled by the bar graph. diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 3215bfd74..a942660bf 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -89,12 +89,14 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; -use std::sync::{ - atomic::{AtomicBool, Ordering}, - Arc, -}; #[cfg(feature = "wgpu")] -use std::{num::NonZeroUsize, sync::Arc}; +use std::{ + num::NonZeroUsize, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; /// Styling and composition primitives. 
pub use peniko; @@ -416,7 +418,7 @@ impl Renderer { let external_resources: &[ExternalResource] = if self.options.use_cpu { // HACK: Our handling of buffers across CPU and GPU is not great - // We don't retrain the bump buffer if we're using CPU shaders + // We don't retain the bump buffer if we're using CPU shaders // This is because some of stages might still be running on the // GPU, and we can't easily get the bump buffer back to the CPU // from the GPU @@ -468,6 +470,97 @@ impl Renderer { surface: &SurfaceTexture, params: &RenderParams, ) -> Result<()> { + let buffer_completed = self.block_on_bump_and_reallocate(device); + + let width = params.width; + let height = params.height; + let mut target = self + .target + .take() + .unwrap_or_else(|| TargetTexture::new(device, width, height)); + // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator + // during resize, specifically on metal. + if target.width != width || target.height != height { + target = TargetTexture::new(device, width, height); + } + let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; + let blit = self + .blit + .as_ref() + .expect("renderer should have configured surface_format to use on a surface"); + let mut encoder = + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + let mut recording = Recording::default(); + let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); + let surface_proxy = ImageProxy::new( + width, + height, + ImageFormat::from_wgpu(surface.texture.format()), + ); + recording.draw(recording::DrawParams { + shader_id: blit.0, + instance_count: 1, + vertex_count: 6, + vertex_buffer: None, + resources: vec![ResourceProxy::Image(target_proxy)], + target: surface_proxy, + clear_color: Some([0., 0., 0., 0.]), + }); + let surface_view = surface + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let external_resources = [ + ExternalResource::Image(target_proxy, &target.view), + ExternalResource::Image(surface_proxy, &surface_view), + ]; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "blit (render_to_surface)", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; + #[cfg(feature = "wgpu-profiler")] + self.profiler.resolve_queries(&mut encoder); + if let Some(download) = &bump_download { + let completed = buffer_completed.clone(); + download + .slice(..) + .map_async(wgpu::MapMode::Read, move |res| match res { + Ok(()) => { + completed.store(true, Ordering::Release); + } + Err(e) => { + log::warn!("Failed to map bump buffer: {e}"); + } + }); + }; + let idx = queue.submit(Some(encoder.finish())); + if let Some(download) = bump_download { + self.previouser_submission = + self.previous_submission + .replace((idx, download, buffer_completed)); + } else { + self.previouser_submission = self.previous_submission.take(); + } + self.target = Some(target); + #[cfg(feature = "wgpu-profiler")] + { + self.profiler.end_frame().unwrap(); + if let Some(result) = self + .profiler + .process_finished_frame(queue.get_timestamp_period()) + { + self.profile_result = Some(result); + } + } + Ok(()) + } + + /// Wait for the frame "two frames ago"'s bump buffer to be available, and reallocate if so. 
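+    ///
+    /// Two bump readbacks are kept in flight: `previous_submission` holds the
+    /// buffer submitted last frame and `previouser_submission` the one before
+    /// that. This waits for the older readback's `map_async` to complete, reads
+    /// its `BumpAllocators` counters, and grows any bump size that was exceeded
+    /// so that the next frame's config uses the larger allocation.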
+ fn block_on_bump_and_reallocate(&mut self, device: &Device) -> Arc { let buffer_completed = if let Some((idx, bump, completed)) = self.previouser_submission.take() { @@ -587,89 +680,7 @@ impl Renderer { } else { Arc::new(AtomicBool::new(false)) }; - - let width = params.width; - let height = params.height; - let mut target = self - .target - .take() - .unwrap_or_else(|| TargetTexture::new(device, width, height)); - // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator - // during resize, specifically on metal. - if target.width != width || target.height != height { - target = TargetTexture::new(device, width, height); - } - let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; - let blit = self - .blit - .as_ref() - .expect("renderer should have configured surface_format to use on a surface"); - let mut recording = Recording::default(); - let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); - let surface_proxy = ImageProxy::new( - width, - height, - ImageFormat::from_wgpu(surface.texture.format()), - ); - recording.draw(recording::DrawParams { - shader_id: blit.0, - instance_count: 1, - vertex_count: 6, - vertex_buffer: None, - resources: vec![ResourceProxy::Image(target_proxy)], - target: surface_proxy, - clear_color: Some([0., 0., 0., 0.]), - }); - - let surface_view = surface - .texture - .create_view(&wgpu::TextureViewDescriptor::default()); - let external_resources = [ - ExternalResource::Image(target_proxy, &target.view), - ExternalResource::Image(surface_proxy, &surface_view), - ]; - self.engine.run_recording( - device, - queue, - &recording, - &external_resources, - "blit (render_to_surface)", - #[cfg(feature = "wgpu-profiler")] - &mut self.profiler, - ); - if let Some(download) = &bump_download { - let completed = buffer_completed.clone(); - download - .slice(..) - .map_async(wgpu::MapMode::Read, move |res| match res { - Ok(()) => { - completed.store(true, Ordering::Release); - } - Err(e) => { - log::warn!("Failed to map bump buffer: {e}"); - } - }); - }; - let idx = queue.submit(Some(encoder.finish())); - if let Some(download) = bump_download { - self.previouser_submission = - self.previous_submission - .replace((idx, download, buffer_completed)); - } else { - self.previouser_submission = self.previous_submission.take(); - } - self.target = Some(target); - #[cfg(feature = "wgpu-profiler")] - { - self.profiler.end_frame().unwrap(); - if let Some(result) = self - .profiler - .process_finished_frame(queue.get_timestamp_period()) - { - self.profile_result = Some(result); - } - } - Ok(()) + buffer_completed } /// Reload the shaders. 
This should only be used during `vello` development diff --git a/vello/src/render.rs b/vello/src/render.rs index c45ba6296..1be4c18bd 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -480,7 +480,7 @@ impl Render { recording.free_resource(path_buf); let out_image = ImageProxy::new(params.width, params.height, ImageFormat::Rgba8); let blend_spill_buf = BufferProxy::new( - buffer_sizes.blend_spill.size_in_bytes().into(), + buffer_sizes.bump_buffers.blend_spill.size_in_bytes().into(), "blend_spill", ); self.fine_wg_count = Some(wg_counts.fine); diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 6aeca0d46..7226090f4 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -37,7 +37,7 @@ pub struct BumpAllocators { pub tile: u32, pub seg_counts: u32, pub segments: u32, - pub blend: u32, + pub blend_spill: u32, pub lines: u32, } @@ -48,6 +48,7 @@ pub struct BumpAllocatorMemory { pub tile: BufferSize, pub seg_counts: BufferSize, pub segments: BufferSize, + pub blend_spill: BufferSize, pub lines: BufferSize, } @@ -59,12 +60,14 @@ impl BumpAllocators { let seg_counts = BufferSize::new(self.seg_counts); let segments = BufferSize::new(self.segments); let lines = BufferSize::new(self.lines); + let blend_spill = BufferSize::new(self.blend_spill); BumpAllocatorMemory { binning, ptcl, tile, seg_counts, segments, + blend_spill, lines, } } @@ -79,6 +82,7 @@ impl BumpAllocatorMemory { self.tile.size_in_bytes(), self.seg_counts.size_in_bytes(), self.segments.size_in_bytes(), + self.blend_spill.size_in_bytes(), self.lines.size_in_bytes(), ] .into_iter() @@ -215,7 +219,7 @@ impl RenderConfig { tiles_size: bump_buffers.tile, seg_counts_size: bump_buffers.seg_counts, segments_size: bump_buffers.segments, - blend_size: buffer_sizes.bump_buffers.blend_spill, + blend_size: bump_buffers.blend_spill, ptcl_size: bump_buffers.ptcl, layout, }, @@ -402,7 +406,6 @@ impl BumpAllocators { segments, tile, blend_spill, - blend: 0, failed: 0, } } @@ -463,6 +466,7 @@ impl BufferSizes { indirect_count, bin_headers, paths, + bump_buffers, } } } diff --git a/vello_shaders/shader/shared/bump.wgsl b/vello_shaders/shader/shared/bump.wgsl index 54763d94f..1d732c04d 100644 --- a/vello_shaders/shader/shared/bump.wgsl +++ b/vello_shaders/shader/shared/bump.wgsl @@ -18,7 +18,7 @@ struct BumpAllocators { tile: atomic, seg_counts: atomic, segments: atomic, - blend: atomic, + blend_spill: atomic, lines: atomic, } diff --git a/vello_shaders/src/cpu/coarse.rs b/vello_shaders/src/cpu/coarse.rs index c28907ea3..e345ef138 100644 --- a/vello_shaders/src/cpu/coarse.rs +++ b/vello_shaders/src/cpu/coarse.rs @@ -354,8 +354,8 @@ fn coarse_main( ptcl[tile_state.cmd_offset as usize] = CMD_END; let scratch_size = (max_blend_depth.saturating_sub(BLEND_STACK_SPLIT)) * TILE_WIDTH * TILE_HEIGHT; - ptcl[blend_offset as usize] = bump.blend; - bump.blend += scratch_size; + ptcl[blend_offset as usize] = bump.blend_spill; + bump.blend_spill += scratch_size; } } } diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs index 81750b0ac..26feaf79a 100644 --- a/vello_shaders/src/cpu/prepare.rs +++ b/vello_shaders/src/cpu/prepare.rs @@ -9,7 +9,7 @@ fn prepare_main(_config: &ConfigUniform, bump: &mut BumpAllocators) { bump.tile = 0; bump.seg_counts = 0; bump.segments = 0; - bump.blend = 0; + bump.blend_spill = 0; bump.lines = 0; bump.failed = 0; } From 947408ec6edd1faeb65a73fb57cb5251ef15f179 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> 
Date: Mon, 17 Jun 2024 10:09:32 +0100 Subject: [PATCH 11/18] Move segments overflow checking to path_tiling_setup This should slightly reduce the amount of work required --- vello/src/render.rs | 2 +- vello/src/shaders.rs | 2 +- vello_shaders/shader/coarse.wgsl | 6 +----- vello_shaders/shader/path_tiling_setup.wgsl | 23 +++++++++++++++------ vello_shaders/src/cpu/path_tiling_setup.rs | 16 +++++++++----- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index 1be4c18bd..293884c77 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -457,7 +457,7 @@ impl Render { recording.dispatch( shaders.path_tiling_setup, wg_counts.path_tiling_setup, - [bump_buf, indirect_count_buf.into(), ptcl_buf], + [config_buf, bump_buf, indirect_count_buf.into(), ptcl_buf], ); recording.dispatch_indirect( shaders.path_tiling, diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index 1124888ef..147d3cb14 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -196,7 +196,7 @@ pub(crate) fn full_shaders( Buffer, ] ); - let path_tiling_setup = add_shader!(path_tiling_setup, [Buffer, Buffer, Buffer]); + let path_tiling_setup = add_shader!(path_tiling_setup, [Uniform, Buffer, Buffer, Buffer]); let path_tiling = add_shader!( path_tiling, [ diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index a829405f9..cc4c80028 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -90,12 +90,8 @@ fn write_path(tile: Tile, tile_ix: u32, draw_flags: u32) { // fine). let n_segs = tile.segment_count_or_ix; if n_segs != 0u { + // We check for overflow of bump.segments in path_tiling_setup var seg_ix = atomicAdd(&bump.segments, n_segs); - if seg_ix > config.segments_size { - // All writes into segments happen in path_tiling, so we don't need - // to avoid UB in this shader - atomicOr(&bump.failed, STAGE_COARSE); - } tiles[tile_ix].segment_count_or_ix = ~seg_ix; alloc_cmd(4u); ptcl[cmd_offset] = CMD_FILL; diff --git a/vello_shaders/shader/path_tiling_setup.wgsl b/vello_shaders/shader/path_tiling_setup.wgsl index 4d5bf2e30..7843c9d84 100644 --- a/vello_shaders/shader/path_tiling_setup.wgsl +++ b/vello_shaders/shader/path_tiling_setup.wgsl @@ -3,15 +3,19 @@ // Set up dispatch size for path tiling stage. 
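+//
+// In the success path the indirect dispatch is ceil(seg_counts / WG_SIZE); for
+// example, 1000 recorded segment counts with WG_SIZE = 256u dispatch 4
+// workgroups. On overflow the dispatch is zeroed instead and fine is signalled
+// through ptcl[0].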
+#import config #import bump @group(0) @binding(0) -var<storage, read_write> bump: BumpAllocators; +var<uniform> config: Config; @group(0) @binding(1) -var<storage, read_write> indirect: IndirectCount; +var<storage, read_write> bump: BumpAllocators; @group(0) @binding(2) +var<storage, read_write> indirect: IndirectCount; + +@group(0) @binding(3) var<storage, read_write> ptcl: array<u32>; // Partition size for path tiling stage let WG_SIZE = 256u; @compute @workgroup_size(1) fn main() { - if atomicLoad(&bump.failed) != 0u { + indirect.count_y = 1u; + indirect.count_z = 1u; + let segments = atomicLoad(&bump.seg_counts); + let overflowed = segments > config.segments_size; + // If segments == config.segments_size, that's not an overflow + if atomicLoad(&bump.failed) != 0u || overflowed { + if overflowed { + // Report the failure so that the CPU and `prepare` know that we've failed + atomicOr(&bump.failed, STAGE_COARSE); + } + // Cancel path_tiling indirect.count_x = 0u; // signal fine rasterizer that failure happened (it doesn't bind bump) ptcl[0] = ~0u; } else { - let segments = atomicLoad(&bump.seg_counts); indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE; } - indirect.count_y = 1u; - indirect.count_z = 1u; } diff --git a/vello_shaders/src/cpu/path_tiling_setup.rs b/vello_shaders/src/cpu/path_tiling_setup.rs index 9b6303691..e087feba2 100644 --- a/vello_shaders/src/cpu/path_tiling_setup.rs +++ b/vello_shaders/src/cpu/path_tiling_setup.rs @@ -1,13 +1,18 @@ // Copyright 2023 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense -use vello_encoding::{BumpAllocators, IndirectCount}; +use vello_encoding::{BumpAllocators, ConfigUniform, IndirectCount}; use super::CpuBinding; const WG_SIZE: usize = 256; -fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { +fn path_tiling_setup_main( + // Config is needed to detect allocation failure + _config: &ConfigUniform, + bump: &BumpAllocators, + indirect: &mut IndirectCount, +) { let segments = bump.seg_counts; indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; indirect.count_y = 1; @@ -15,8 +20,9 @@ fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { } pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) { - let bump = resources[0].as_typed(); - let mut indirect = resources[1].as_typed_mut(); + let config = resources[0].as_typed(); + let bump = resources[1].as_typed(); + let mut indirect = resources[2].as_typed_mut(); // binding 2 is ptcl, which we would need if we propagate failure - path_tiling_setup_main(&bump, &mut indirect); + path_tiling_setup_main(&config, &bump, &mut indirect); } From cf62daeade62489811c7eb84dc2ac1509b734778 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:01:30 +0100 Subject: [PATCH 12/18] Fix compilation of bump estimator --- vello/src/scene.rs | 4 ++-- vello_encoding/src/estimate.rs | 7 +++---- vello_shaders/src/cpu/prepare.rs | 3 +++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vello/src/scene.rs b/vello/src/scene.rs index 103e9b222..594ec3ff5 100644 --- a/vello/src/scene.rs +++ b/vello/src/scene.rs @@ -14,7 +14,7 @@ use skrifa::{ GlyphId, MetadataProvider, OutlineGlyphCollection, }; #[cfg(feature = "bump_estimate")] -use vello_encoding::BumpAllocatorMemory; +use vello_encoding::BumpAllocators; use vello_encoding::{Encoding, Glyph, GlyphRun, Patch, Transform}; // TODO - Document invariants and edge cases (#470) @@ -49,7 +49,7 @@ impl Scene { /// Tally up the bump allocator estimate for the current state of the encoding, ///
taking into account an optional `transform` applied to the entire scene. #[cfg(feature = "bump_estimate")] - pub fn bump_estimate(&self, transform: Option) -> BumpAllocatorMemory { + pub fn bump_estimate(&self, transform: Option) -> BumpAllocators { self.estimator .tally(transform.as_ref().map(Transform::from_kurbo).as_ref()) } diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index 62b943013..6d1e87008 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -162,7 +162,7 @@ impl BumpEstimator { } /// Produce the final total, applying an optional transform to all content. - pub fn tally(&self, transform: Option<&Transform>) -> BumpAllocatorMemory { + pub fn tally(&self, transform: Option<&Transform>) -> BumpAllocators { let scale = transform_scale(transform); // The post-flatten line estimate. @@ -172,7 +172,7 @@ impl BumpEstimator { // segments as there are lines, in case `segments` was underestimated at small scales. let n_segments = ((self.segments as f64 * scale).ceil() as u32).max(lines); - let bump = BumpAllocators { + BumpAllocators { failed: 0, // TODO: we can provide a tighter bound here but for now we // assume that binning must be bounded by the segment count. @@ -183,8 +183,7 @@ impl BumpEstimator { seg_counts: n_segments, segments: n_segments, lines, - }; - bump.memory() + } } fn count_stroke_caps(&mut self, style: Cap, scaled_width: f64, count: u32) { diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs index 26feaf79a..306f05f9e 100644 --- a/vello_shaders/src/cpu/prepare.rs +++ b/vello_shaders/src/cpu/prepare.rs @@ -1,3 +1,6 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + use vello_encoding::{BumpAllocators, ConfigUniform}; use super::CpuBinding; From f504d1295cc933a23f5ae359d25b6c51fe5c74cf Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:11:35 +0100 Subject: [PATCH 13/18] Fix incorrect wgpu mixing --- vello/src/lib.rs | 7 +++++-- vello_encoding/src/estimate.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index a942660bf..eeed381d3 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -128,7 +128,6 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; -use wgpu::SubmissionIndex; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -136,7 +135,10 @@ pub use debug::DebugLayers; /// Temporary export, used in `with_winit` for stats pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] -use wgpu::{Buffer, BufferUsages, Device, Queue, SurfaceTexture, TextureFormat, TextureView}; +use wgpu::{ + Buffer, BufferUsages, Device, PipelineCompilationOptions, Queue, SubmissionIndex, + SurfaceTexture, TextureFormat, TextureView, +}; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -247,6 +249,7 @@ pub enum Error { #[allow(dead_code)] // this can be unused when wgpu feature is not used pub(crate) type Result = std::result::Result; +#[cfg(feature = "wgpu")] type BumpSubmission = (SubmissionIndex, Buffer, Arc); /// Renders a scene into a texture or surface. diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index 6d1e87008..c36497d11 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -4,7 +4,7 @@ //! 
This utility provides conservative size estimation for buffer allocations backing //! GPU bump memory. This estimate relies on heuristics and naturally overestimates. -use super::{BumpAllocatorMemory, BumpAllocators, Transform}; +use super::{BumpAllocators, Transform}; use peniko::kurbo::{Cap, Join, PathEl, Point, Stroke, Vec2}; const RSQRT_OF_TOL: f64 = 2.2360679775; // tol = 0.2 From 692c7f82cc22b18772660af3d355fbd2ab868122 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:14:54 +0100 Subject: [PATCH 14/18] Use the initial sizes to let tests work --- vello/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index eeed381d3..9b56d06a4 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -370,7 +370,7 @@ impl Renderer { bump: None, previous_submission: None, previouser_submission: None, - bump_sizes: BumpAllocators::default(), + bump_sizes: BumpAllocators::initial_sizes(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { From 7d9c609fc1f33af0c28565d69907d9a29c7a20ec Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:34:23 +0100 Subject: [PATCH 15/18] Save some rebase cleanup --- vello/src/lib.rs | 8 +++++--- vello/src/render.rs | 7 ++++--- vello_shaders/shader/coarse.wgsl | 2 +- vello_shaders/shader/prepare.wgsl | 6 ++++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 9b56d06a4..bb39c4240 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -136,8 +136,8 @@ pub use debug::DebugLayers; pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] use wgpu::{ - Buffer, BufferUsages, Device, PipelineCompilationOptions, Queue, SubmissionIndex, - SurfaceTexture, TextureFormat, TextureView, + Buffer, BufferUsages, Device, Queue, SubmissionIndex, SurfaceTexture, TextureFormat, + TextureView, }; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -491,7 +491,7 @@ impl Renderer { .blit .as_ref() .expect("renderer should have configured surface_format to use on a surface"); - let mut encoder = + let encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); let mut recording = Recording::default(); let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); @@ -526,6 +526,8 @@ impl Renderer { &mut self.profiler, )?; #[cfg(feature = "wgpu-profiler")] + let mut encoder = encoder; + #[cfg(feature = "wgpu-profiler")] self.profiler.resolve_queries(&mut encoder); if let Some(download) = &bump_download { let completed = buffer_completed.clone(); diff --git a/vello/src/render.rs b/vello/src/render.rs index 293884c77..a79951be2 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -103,7 +103,7 @@ pub(crate) fn render_encoding_full( ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = - render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, true); + render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, false); let out_image = render.out_image(); let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); @@ -497,8 +497,8 @@ impl Render { image_atlas: ResourceProxy::Image(image_atlas), out_image, }); - // TODO: This second check is a massive hack - if robust && 
!shaders.pathtag_is_cpu { + // TODO: This check is a massive hack to disable robustness if + if !shaders.pathtag_is_cpu { recording.download(*bump_buf.as_buf().unwrap()); } recording.free_resource(bump_buf); @@ -595,6 +595,7 @@ impl Render { recording.free_resource(fine.gradient_image); recording.free_resource(fine.image_atlas); recording.free_resource(fine.info_bin_data_buf); + recording.free_resource(fine.blend_spill_buf); // TODO: make mask buf persistent if let Some(mask_buf) = self.mask_buf.take() { recording.free_resource(mask_buf); diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index cc4c80028..8048e07b1 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -444,7 +444,7 @@ fn main( var blend_ix = 0u; if max_blend_depth > BLEND_STACK_SPLIT { let scratch_size = (max_blend_depth - BLEND_STACK_SPLIT) * TILE_WIDTH * TILE_HEIGHT; - blend_ix = atomicAdd(&bump.blend, scratch_size); + blend_ix = atomicAdd(&bump.blend_spill, scratch_size); if blend_ix + scratch_size > config.blend_size { atomicOr(&bump.failed, STAGE_COARSE); } diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 01006afd6..530d43362 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -50,7 +50,9 @@ fn main() { if config.lines_size < atomicLoad(&bump.lines) { should_cancel = true; } - // config.blend_size < atomicLoad(&bump.blend) + if config.blend_size < atomicLoad(&bump.blend_spill) { + should_cancel = true; + } if should_cancel { // Then don't run this frame config.cancelled = 1u; @@ -64,6 +66,6 @@ fn main() { atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); - atomicStore(&bump.blend, 0u); + atomicStore(&bump.blend_spill, 0u); atomicStore(&bump.lines, 0u); } From d2e97822df20a7a83a882fca6d67af2b221318be Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:36:25 +0100 Subject: [PATCH 16/18] Properly reallocate blend spill --- vello/src/lib.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index bb39c4240..797139831 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -668,6 +668,17 @@ impl Renderer { ); self.bump_sizes.segments = new_size; } + if data.blend_spill > self.bump_sizes.blend_spill { + changed = true; + let new_size = data.blend_spill * 5 / 4; + log::debug!( + "Resizing blend_spill to {:?} (Needed {:?}, had {:?})", + new_size, + data.blend_spill, + self.bump_sizes.blend_spill, + ); + self.bump_sizes.blend_spill = new_size; + } if !changed { log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); } else { From d34f3db24ef88c21adc5f00dc3672459f688d9fb Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:07:29 +0100 Subject: [PATCH 17/18] If allocation fails, don't try and download a fake lines buffer --- vello/src/lib.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 797139831..2fbcee784 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -894,17 +894,18 @@ impl Renderer { .as_ref() .expect("renderer should have configured surface_format to use on a surface"); let bump = result.bump.as_ref().unwrap(); - // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset. 
- let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?; - debug.render( - &mut recording, - surface_proxy, - &captured, - bump, - params, - &downloads, - ); - + if bump.failed == 0 { + // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset. + let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?; + debug.render( + &mut recording, + surface_proxy, + &captured, + bump, + params, + &downloads, + ); + } // TODO: this sucks. better to release everything in a helper // TODO: it would be much better to have a way to safely destroy a buffer. self.engine.free_download(captured.lines); From 83a5c286d412dc5476f8f10522abade57c9ba8c4 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:11:47 +0100 Subject: [PATCH 18/18] Fix clippy issues with features --- vello/src/render.rs | 2 +- vello_encoding/src/estimate.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index a79951be2..292a891d4 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -138,7 +138,7 @@ impl Render { shaders: &FullShaders, params: &RenderParams, bump_sizes: BumpAllocators, - robust: bool, + #[cfg_attr(not(feature = "debug_layers"), allow(unused))] robust: bool, ) -> Recording { use vello_encoding::RenderConfig; let mut recording = Recording::default(); diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index c36497d11..7636401ab 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -179,7 +179,7 @@ impl BumpEstimator { binning: n_segments, ptcl: 0, tile: 0, - blend: 0, + blend_spill: 0, seg_counts: n_segments, segments: n_segments, lines,