From 2dda2008fac33c47077644358476dc5e66fdba6e Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Mon, 13 May 2024 15:27:59 +0200 Subject: [PATCH 01/18] Stash robustness thoughts --- vello/src/lib.rs | 1 + vello/src/robust.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 vello/src/robust.rs diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 419b5ad93..85affba29 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -84,6 +84,7 @@ mod debug; mod recording; mod render; +mod robust; mod scene; mod shaders; #[cfg(feature = "wgpu")] diff --git a/vello/src/robust.rs b/vello/src/robust.rs new file mode 100644 index 000000000..05ed18b30 --- /dev/null +++ b/vello/src/robust.rs @@ -0,0 +1,12 @@ +//! A discussion of Vello's robust dynamic memory support +//! +//! When running the Vello pipeline, there are several buffers which: +//! 1) Need to be large enough to store +//! 2) Have a size which is non-trivial to calculate before running the pipeline +//! +//! When using wgpu (and most GPU apis), it is not possible for the GPU to synchronously +//! request a larger buffer, so we have to provide a best-effort buffer for this purpose. +//! +//! ## Handling failures +//! +//! If the buffer which was provided was too small, we have an issue. From 5e630b5a6e014a236c2a0f74d9d9da44dca7e753 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:55:19 +0100 Subject: [PATCH 02/18] Setup early exit in all stages --- vello/src/render.rs | 7 ++- vello/src/shaders.rs | 3 + vello/src/wgpu_engine.rs | 4 +- vello_encoding/src/config.rs | 5 ++ vello_shaders/shader/backdrop.wgsl | 3 + vello_shaders/shader/bbox_clear.wgsl | 3 + vello_shaders/shader/clip_leaf.wgsl | 3 + vello_shaders/shader/clip_reduce.wgsl | 1 + vello_shaders/shader/coarse.wgsl | 2 +- vello_shaders/shader/draw_leaf.wgsl | 12 ++-- vello_shaders/shader/draw_reduce.wgsl | 3 + vello_shaders/shader/flatten.wgsl | 12 ++-- vello_shaders/shader/path_count.wgsl | 1 + vello_shaders/shader/path_tiling.wgsl | 1 + vello_shaders/shader/pathtag_reduce.wgsl | 3 + vello_shaders/shader/pathtag_reduce2.wgsl | 1 + vello_shaders/shader/pathtag_scan.wgsl | 3 + vello_shaders/shader/pathtag_scan1.wgsl | 1 + vello_shaders/shader/prepare.wgsl | 70 +++++++++++++++++++++++ vello_shaders/shader/shared/bump.wgsl | 1 + vello_shaders/shader/shared/config.wgsl | 3 + vello_shaders/shader/tile_alloc.wgsl | 2 +- 22 files changed, 126 insertions(+), 18 deletions(-) create mode 100644 vello_shaders/shader/prepare.wgsl diff --git a/vello/src/render.rs b/vello/src/render.rs index 952ef498a..d6d2280dd 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -195,6 +195,10 @@ impl Render { buffer_sizes.path_reduced.size_in_bytes().into(), "reduced_buf", ); + let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); + recording.clear_all(bump_buf); + let bump_buf = ResourceProxy::Buffer(bump_buf); + recording.dispatch(shaders.prepare, (1, 1, 1), [config_buf, bump_buf]); // TODO: really only need pathtag_wgs - 1 recording.dispatch( shaders.pathtag_reduce, @@ -255,9 +259,6 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); - recording.clear_all(bump_buf); - let bump_buf = ResourceProxy::Buffer(bump_buf); let lines_buf = ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); recording.dispatch( 
diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index a58e0edfe..e0550b5a2 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -17,6 +17,7 @@ use crate::{ // Shaders for the full pipeline pub struct FullShaders { + pub prepare: ShaderId, pub pathtag_reduce: ShaderId, pub pathtag_reduce2: ShaderId, pub pathtag_scan1: ShaderId, @@ -101,6 +102,7 @@ pub(crate) fn full_shaders( }; } + let prepare = add_shader!(prepare, [Buffer, Buffer], CpuShaderType::Skipped); let pathtag_reduce = add_shader!(pathtag_reduce, [Uniform, BufReadOnly, Buffer]); let pathtag_reduce2 = add_shader!( pathtag_reduce2, @@ -249,6 +251,7 @@ pub(crate) fn full_shaders( }; Ok(FullShaders { + prepare, pathtag_reduce, pathtag_reduce2, pathtag_scan, diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index b10cbac5c..fa3e80b3d 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -409,7 +409,9 @@ impl WgpuEngine { transient_map .bufs .insert(buf_proxy.id, TransientBuf::Cpu(bytes)); - let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST; + // TODO: More principled way of working out usages + let usage = + BufferUsages::UNIFORM | BufferUsages::COPY_DST | BufferUsages::STORAGE; // Same consideration as above let buf = self .pool diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 88da7fd46..db5bb92c8 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -137,6 +137,10 @@ pub struct ConfigUniform { pub base_color: u32, /// Layout of packed scene data. pub layout: Layout, + /// Whether this stage has been cancelled at startup due to a predicted failure. + /// + /// Will be set by the `prepare` stage, and so should always be 0 on CPU. + pub cancelled: u32, /// Size of line soup buffer allocation (in [`LineSoup`]s) pub lines_size: u32, /// Size of binning buffer allocation (in `u32`s). 
@@ -178,6 +182,7 @@ impl RenderConfig { Self { gpu: ConfigUniform { width_in_tiles, + cancelled: false.into(), height_in_tiles, target_width: width, target_height: height, diff --git a/vello_shaders/shader/backdrop.wgsl b/vello_shaders/shader/backdrop.wgsl index acf97aa2f..78facafad 100644 --- a/vello_shaders/shader/backdrop.wgsl +++ b/vello_shaders/shader/backdrop.wgsl @@ -26,6 +26,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } let width_in_tiles = config.width_in_tiles; let ix = wg_id.x * width_in_tiles + local_id.x; var backdrop = 0; diff --git a/vello_shaders/shader/bbox_clear.wgsl b/vello_shaders/shader/bbox_clear.wgsl index 067d7c2dd..af1a0dc32 100644 --- a/vello_shaders/shader/bbox_clear.wgsl +++ b/vello_shaders/shader/bbox_clear.wgsl @@ -14,6 +14,9 @@ var path_bboxes: array; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; if ix < config.n_path { path_bboxes[ix].x0 = 0x7fffffff; diff --git a/vello_shaders/shader/clip_leaf.wgsl b/vello_shaders/shader/clip_leaf.wgsl index e947177d6..eb222a930 100644 --- a/vello_shaders/shader/clip_leaf.wgsl +++ b/vello_shaders/shader/clip_leaf.wgsl @@ -83,6 +83,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } var bic: Bic; if local_id.x < wg_id.x { bic = reduced[local_id.x]; diff --git a/vello_shaders/shader/clip_reduce.wgsl b/vello_shaders/shader/clip_reduce.wgsl index bdbc0e989..4ec28b72b 100644 --- a/vello_shaders/shader/clip_reduce.wgsl +++ b/vello_shaders/shader/clip_reduce.wgsl @@ -27,6 +27,7 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + // TODO: Cancel if this entire run has been cancelled? let inp = clip_inp[global_id.x].path_ix; let is_push = inp >= 0; var bic = Bic(1u - u32(is_push), u32(is_push)); diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index 6856396b4..ba0edfa8f 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -155,7 +155,7 @@ fn main( // We need to check only prior stages, as if this stage has failed in another workgroup, // we still want to know this workgroup's memory requirement. 
if local_id.x == 0u { - var failed = atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_FLATTEN); + var failed = atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_FLATTEN | PREVIOUS_RUN); if atomicLoad(&bump.seg_counts) > config.seg_counts_size { failed |= STAGE_PATH_COUNT; } diff --git a/vello_shaders/shader/draw_leaf.wgsl b/vello_shaders/shader/draw_leaf.wgsl index 2ef76db80..2b87798d3 100644 --- a/vello_shaders/shader/draw_leaf.wgsl +++ b/vello_shaders/shader/draw_leaf.wgsl @@ -54,6 +54,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } // Reduce prefix of workgroups up to this one var agg = draw_monoid_identity(); if local_id.x < wg_id.x { @@ -108,10 +111,7 @@ fn main( } let dd = config.drawdata_base + m.scene_offset; let di = m.info_offset; - if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT || - tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || - tag_word == DRAWTAG_FILL_IMAGE || tag_word == DRAWTAG_BEGIN_CLIP - { + if tag_word == DRAWTAG_FILL_COLOR || tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE || tag_word == DRAWTAG_BEGIN_CLIP { let bbox = path_bbox[m.path_ix]; // TODO: bbox is mostly yagni here, sort that out. Maybe clips? // let x0 = f32(bbox.x0); @@ -121,9 +121,7 @@ fn main( // let bbox_f = vec4(x0, y0, x1, y1); var transform = Transform(); let draw_flags = bbox.draw_flags; - if tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || - tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE - { + if tag_word == DRAWTAG_FILL_LIN_GRADIENT || tag_word == DRAWTAG_FILL_RAD_GRADIENT || tag_word == DRAWTAG_FILL_SWEEP_GRADIENT || tag_word == DRAWTAG_FILL_IMAGE { transform = read_transform(config.transform_base, bbox.trans_ix); } switch tag_word { diff --git a/vello_shaders/shader/draw_reduce.wgsl b/vello_shaders/shader/draw_reduce.wgsl index 7a12b8188..5acaf6a18 100644 --- a/vello_shaders/shader/draw_reduce.wgsl +++ b/vello_shaders/shader/draw_reduce.wgsl @@ -24,6 +24,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } let num_blocks_total = (config.n_drawobj + (WG_SIZE - 1u)) / WG_SIZE; // When the number of blocks exceeds the workgroup size, divide // the work evenly so each workgroup handles n_blocks / wg, with diff --git a/vello_shaders/shader/flatten.wgsl b/vello_shaders/shader/flatten.wgsl index 1de07910f..a4de16d90 100644 --- a/vello_shaders/shader/flatten.wgsl +++ b/vello_shaders/shader/flatten.wgsl @@ -359,8 +359,7 @@ fn flatten_euler( transform = local_to_device; let mat = transform.mat; - scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + - length(vec2(mat.x - mat.w, mat.y + mat.z)); + scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + length(vec2(mat.x - mat.w, mat.y + mat.z)); } // Drop zero length lines. 
This is an exact equality test because dropping very short @@ -811,6 +810,9 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; pathdata_base = config.pathdata_base; bbox = vec4(1e31, 1e31, -1e31, -1e31); @@ -848,7 +850,7 @@ fn main( let offset_tangent = offset * normalize(tangent); let n = offset_tangent.yx * vec2f(-1., 1.); draw_cap(path_ix, (style_flags & STYLE_FLAGS_START_CAP_MASK) >> 2u, - pts.p0, pts.p0 - n, pts.p0 + n, -offset_tangent, transform); + pts.p0, pts.p0 - n, pts.p0 + n, -offset_tangent, transform); } else { // Don't draw anything if the path is closed. } @@ -878,11 +880,11 @@ fn main( if neighbor.do_join { draw_join(path_ix, style_flags, pts.p3, tan_prev, tan_next, - n_prev, n_next, transform); + n_prev, n_next, transform); } else { // Draw end cap. draw_cap(path_ix, (style_flags & STYLE_FLAGS_END_CAP_MASK), - pts.p3, pts.p3 + n_prev, pts.p3 - n_prev, offset_tangent, transform); + pts.p3, pts.p3 + n_prev, pts.p3 - n_prev, offset_tangent, transform); } } } else { diff --git a/vello_shaders/shader/path_count.wgsl b/vello_shaders/shader/path_count.wgsl index 7de89278d..a9e7c6171 100644 --- a/vello_shaders/shader/path_count.wgsl +++ b/vello_shaders/shader/path_count.wgsl @@ -52,6 +52,7 @@ let ROBUST_EPSILON: f32 = 2e-7; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + // If the pipeline is cancelled, `path_count_setup` will not allocate any threads let n_lines = atomicLoad(&bump.lines); var count = 0u; if global_id.x < n_lines { diff --git a/vello_shaders/shader/path_tiling.wgsl b/vello_shaders/shader/path_tiling.wgsl index 63f982b87..eb659f8f3 100644 --- a/vello_shaders/shader/path_tiling.wgsl +++ b/vello_shaders/shader/path_tiling.wgsl @@ -40,6 +40,7 @@ let ROBUST_EPSILON: f32 = 2e-7; fn main( @builtin(global_invocation_id) global_id: vec3, ) { + // If the pipeline is cancelled, `path_tiling_setup` will not allocate any threads let n_segments = atomicLoad(&bump.seg_counts); if global_id.x < n_segments { let seg_count = seg_counts[global_id.x]; diff --git a/vello_shaders/shader/pathtag_reduce.wgsl b/vello_shaders/shader/pathtag_reduce.wgsl index ab6e10e98..f591e4d61 100644 --- a/vello_shaders/shader/pathtag_reduce.wgsl +++ b/vello_shaders/shader/pathtag_reduce.wgsl @@ -23,6 +23,9 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + if config.cancelled != 0u { + return; + } let ix = global_id.x; let tag_word = scene[config.pathtag_base + ix]; var agg = reduce_tag(tag_word); diff --git a/vello_shaders/shader/pathtag_reduce2.wgsl b/vello_shaders/shader/pathtag_reduce2.wgsl index eb8621f0f..f2ce907de 100644 --- a/vello_shaders/shader/pathtag_reduce2.wgsl +++ b/vello_shaders/shader/pathtag_reduce2.wgsl @@ -23,6 +23,7 @@ fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, ) { + // TODO: Cancel if needed? 
let ix = global_id.x; var agg = reduced_in[ix]; sh_scratch[local_id.x] = agg; diff --git a/vello_shaders/shader/pathtag_scan.wgsl b/vello_shaders/shader/pathtag_scan.wgsl index 27a34bdf7..b460afc3e 100644 --- a/vello_shaders/shader/pathtag_scan.wgsl +++ b/vello_shaders/shader/pathtag_scan.wgsl @@ -31,6 +31,9 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + if config.cancelled != 0u { + return; + } #ifdef small var agg = tag_monoid_identity(); if local_id.x < wg_id.x { diff --git a/vello_shaders/shader/pathtag_scan1.wgsl b/vello_shaders/shader/pathtag_scan1.wgsl index 7f3b47659..10610f7c8 100644 --- a/vello_shaders/shader/pathtag_scan1.wgsl +++ b/vello_shaders/shader/pathtag_scan1.wgsl @@ -29,6 +29,7 @@ fn main( @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { + // TODO: Cancel if needed var agg = tag_monoid_identity(); if local_id.x < wg_id.x { agg = reduced2[local_id.x]; diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl new file mode 100644 index 000000000..ed6cdca39 --- /dev/null +++ b/vello_shaders/shader/prepare.wgsl @@ -0,0 +1,70 @@ +// Copyright 2022 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Determine whether the Vello pipeline is likely to fail during this run +// and therefore whether all later stages should be cancelled. + +#import config +#import bump + +@group(0) @binding(0) +var config: Config; + +@group(0) @binding(1) +// TODO: Use a non-atomic version of BumpAllocators? +var bump: BumpAllocators; + +@compute @workgroup_size(1) +fn main() { + var should_cancel = false; + let previous_failure = atomicLoad(&bump.failed); + if previous_failure == PREVIOUS_RUN { + // Don't early-exit from multiple frames in a row + // The CPU should be blocking on the frame which failed anyway, so this + // case should never be reached, but if the CPU side isn't doing that + // properly, we can try again. 
+ // (Note that this check is simply an early-exit for this case, as all the + // bump values would have been reset to 0 anyway) + atomicStore(&bump.failed, 0u); + } else if previous_failure != 0u { + // If the previous frame failed (for another reason) + + // And we don't have enough memory to have run that previous frame + if config.lines_size < atomicLoad(&bump.lines) { + should_cancel = true; + } + if config.binning_size < atomicLoad(&bump.binning) { + should_cancel = true; + } + if config.ptcl_size < atomicLoad(&bump.ptcl) { + should_cancel = true; + } + if config.tiles_size < atomicLoad(&bump.tile) { + should_cancel = true; + } + if config.seg_counts_size < atomicLoad(&bump.seg_counts) { + should_cancel = true; + } + if config.segments_size < atomicLoad(&bump.segments) { + should_cancel = true; + } + if config.lines_size < atomicLoad(&bump.lines) { + should_cancel = true; + } + // config.blend_size < atomicLoad(&bump.blend) + if should_cancel { + // Then don't run this frame + config.cancelled = 1u; + atomicStore(&bump.failed, PREVIOUS_RUN); + } else { + atomicStore(&bump.failed, 0u); + } + } + atomicStore(&bump.binning, 0u); + atomicStore(&bump.ptcl, 0u); + atomicStore(&bump.tile, 0u); + atomicStore(&bump.seg_counts, 0u); + atomicStore(&bump.segments, 0u); + atomicStore(&bump.blend, 0u); + atomicStore(&bump.lines, 0u); +} diff --git a/vello_shaders/shader/shared/bump.wgsl b/vello_shaders/shader/shared/bump.wgsl index 9270fc2f8..54763d94f 100644 --- a/vello_shaders/shader/shared/bump.wgsl +++ b/vello_shaders/shader/shared/bump.wgsl @@ -7,6 +7,7 @@ let STAGE_TILE_ALLOC: u32 = 0x2u; let STAGE_FLATTEN: u32 = 0x4u; let STAGE_PATH_COUNT: u32 = 0x8u; let STAGE_COARSE: u32 = 0x10u; +let PREVIOUS_RUN: u32 = 0x20u; // This must be kept in sync with the struct in config.rs in the encoding crate. struct BumpAllocators { diff --git a/vello_shaders/shader/shared/config.wgsl b/vello_shaders/shader/shared/config.wgsl index 3391afd9a..95d68cc7c 100644 --- a/vello_shaders/shader/shared/config.wgsl +++ b/vello_shaders/shader/shared/config.wgsl @@ -32,6 +32,9 @@ struct Config { transform_base: u32, style_base: u32, + // Whether this stage has been cancelled at startup. 0 means uncancelled. + cancelled: u32, + // Sizes of bump allocated buffers (in element size units) lines_size: u32, binning_size: u32, diff --git a/vello_shaders/shader/tile_alloc.wgsl b/vello_shaders/shader/tile_alloc.wgsl index c6073d128..2f32f8668 100644 --- a/vello_shaders/shader/tile_alloc.wgsl +++ b/vello_shaders/shader/tile_alloc.wgsl @@ -41,7 +41,7 @@ fn main( // We need to check only prior stages, as if this stage has failed in another workgroup, // we still want to know this workgroup's memory requirement. 
if local_id.x == 0u { - let failed = (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_FLATTEN)) != 0u; + let failed = (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_FLATTEN | PREVIOUS_RUN)) != 0u; sh_previous_failed = u32(failed); } let failed = workgroupUniformLoad(&sh_previous_failed); From 4309b593f42e5852672e0682fd34bf1a4510f574 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:56:23 +0100 Subject: [PATCH 03/18] Give slightly better error output `rust-analyzer` will eagerly eat the error output otherwise --- vello_shaders/build.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vello_shaders/build.rs b/vello_shaders/build.rs index 79abde655..57930b26a 100644 --- a/vello_shaders/build.rs +++ b/vello_shaders/build.rs @@ -24,7 +24,10 @@ fn main() { let mut shaders = match compile::ShaderInfo::from_default() { Ok(s) => s, Err(err) => { - eprintln!("{err}"); + let formatted = err.to_string(); + for line in formatted.lines() { + println!("cargo:warning={line}"); + } return; } }; From 123f60d5f710049a975032169b9f3c32ec2d8c2d Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 15:24:07 +0100 Subject: [PATCH 04/18] Fix CPU handling and remove empty file --- vello/src/lib.rs | 41 +++++++++++++++++++++++++------ vello/src/render.rs | 8 +++--- vello/src/robust.rs | 12 --------- vello/src/shaders.rs | 2 +- vello_shaders/shader/prepare.wgsl | 1 + vello_shaders/src/cpu.rs | 2 ++ vello_shaders/src/cpu/prepare.rs | 23 +++++++++++++++++ 7 files changed, 65 insertions(+), 24 deletions(-) delete mode 100644 vello/src/robust.rs create mode 100644 vello_shaders/src/cpu/prepare.rs diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 85affba29..6d57a6de0 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -84,7 +84,6 @@ mod debug; mod recording; mod render; -mod robust; mod scene; mod shaders; #[cfg(feature = "wgpu")] @@ -123,6 +122,7 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; +use wgpu::{Buffer, BufferUsages}; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -252,6 +252,7 @@ pub struct Renderer { blit: Option, #[cfg(feature = "debug_layers")] debug: Option, + bump: Option, target: Option, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, @@ -351,6 +352,7 @@ impl Renderer { #[cfg(feature = "debug_layers")] debug, target: None, + bump: None, // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -388,17 +390,42 @@ impl Renderer { texture: &TextureView, params: &RenderParams, ) -> Result<()> { - let (recording, target) = + let (mut recording, target, bump_buf) = render::render_full(scene, &mut self.resolver, &self.shaders, params); - let external_resources = [ExternalResource::Image( - *target.as_image().unwrap(), - texture, - )]; + let cpu_external; + let gpu_external; + let gpu_bump; + let external_resources: &[ExternalResource] = if self.options.use_cpu { + // HACK: Our handling of buffers across CPU and GPU is not great + + // We don't retrain the bump buffer if we're using CPU shaders + // This is because some of stages might still be running on the + // GPU, and we can't easily get the bump buffer back to the CPU + // from the GPU + recording.free_buffer(bump_buf); + cpu_external = [ExternalResource::Image(target, texture)]; + &cpu_external + } else { + gpu_bump = self.bump.get_or_insert_with(|| { + 
device.create_buffer(&wgpu::BufferDescriptor { + label: Some("bump"), + size: bump_buf.size, + usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC, + mapped_at_creation: false, + }) + }); + gpu_external = [ + ExternalResource::Image(target, texture), + ExternalResource::Buffer(bump_buf, &gpu_bump), + ]; + &gpu_external + }; + self.engine.run_recording( device, queue, &recording, - &external_resources, + external_resources, "render_to_texture", #[cfg(feature = "wgpu-profiler")] &mut self.profiler, diff --git a/vello/src/render.rs b/vello/src/render.rs index d6d2280dd..1ee235de8 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -82,7 +82,7 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, -) -> (Recording, ResourceProxy) { +) -> (Recording, ImageProxy, BufferProxy) { render_encoding_full(scene.encoding(), resolver, shaders, params) } @@ -96,12 +96,13 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, -) -> (Recording, ResourceProxy) { +) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = render.render_encoding_coarse(encoding, resolver, shaders, params, false); let out_image = render.out_image(); + let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); - (recording, out_image.into()) + (recording, out_image, bump_buf) } impl Default for Render { @@ -196,7 +197,6 @@ impl Render { "reduced_buf", ); let bump_buf = BufferProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); - recording.clear_all(bump_buf); let bump_buf = ResourceProxy::Buffer(bump_buf); recording.dispatch(shaders.prepare, (1, 1, 1), [config_buf, bump_buf]); // TODO: really only need pathtag_wgs - 1 diff --git a/vello/src/robust.rs b/vello/src/robust.rs deleted file mode 100644 index 05ed18b30..000000000 --- a/vello/src/robust.rs +++ /dev/null @@ -1,12 +0,0 @@ -//! A discussion of Vello's robust dynamic memory support -//! -//! When running the Vello pipeline, there are several buffers which: -//! 1) Need to be large enough to store -//! 2) Have a size which is non-trivial to calculate before running the pipeline -//! -//! When using wgpu (and most GPU apis), it is not possible for the GPU to synchronously -//! request a larger buffer, so we have to provide a best-effort buffer for this purpose. -//! -//! ## Handling failures -//! -//! If the buffer which was provided was too small, we have an issue. diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index e0550b5a2..1124888ef 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -102,7 +102,7 @@ pub(crate) fn full_shaders( }; } - let prepare = add_shader!(prepare, [Buffer, Buffer], CpuShaderType::Skipped); + let prepare = add_shader!(prepare, [Buffer, Buffer]); let pathtag_reduce = add_shader!(pathtag_reduce, [Uniform, BufReadOnly, Buffer]); let pathtag_reduce2 = add_shader!( pathtag_reduce2, diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index ed6cdca39..93ed8c447 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -3,6 +3,7 @@ // Determine whether the Vello pipeline is likely to fail during this run // and therefore whether all later stages should be cancelled. 
+// This enables reduced latency in most failure cases #import config #import bump diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index a129fce59..2266b283c 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -29,6 +29,7 @@ mod pathtag_reduce; mod pathtag_scan; mod tile_alloc; mod util; +mod prepare; pub use backdrop::backdrop; pub use bbox_clear::bbox_clear; @@ -46,6 +47,7 @@ pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; pub use pathtag_scan::pathtag_scan; pub use tile_alloc::tile_alloc; +pub use prepare::prepare; use std::cell::{Ref, RefCell, RefMut}; use std::ops::{Deref, DerefMut}; diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs new file mode 100644 index 000000000..81750b0ac --- /dev/null +++ b/vello_shaders/src/cpu/prepare.rs @@ -0,0 +1,23 @@ +use vello_encoding::{BumpAllocators, ConfigUniform}; + +use super::CpuBinding; + +fn prepare_main(_config: &ConfigUniform, bump: &mut BumpAllocators) { + // We don't yet do robust bump handling in the CPU shaders, so do the minimal version for this shader + bump.binning = 0; + bump.ptcl = 0; + bump.tile = 0; + bump.seg_counts = 0; + bump.segments = 0; + bump.blend = 0; + bump.lines = 0; + bump.failed = 0; +} + +pub fn prepare(_n_wg: u32, resources: &[CpuBinding]) { + // On the GPU, config is mutable, but our CPU runner doesn't allow accessing uploaded buffers as mutable + // This is a hack + let config = resources[0].as_typed(); + let mut bump = resources[1].as_typed_mut(); + prepare_main(&config, &mut bump); +} From dd506d569919f7f84b82eda54142398467ee5b6a Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:40:42 +0100 Subject: [PATCH 05/18] Get the hackiest version of robust memory working --- vello/src/lib.rs | 123 ++++++++++++++++++++++++++++++++--- vello/src/render.rs | 48 ++++++++++---- vello/src/wgpu_engine.rs | 10 +++ vello_encoding/src/config.rs | 97 +++++++++++++++++---------- vello_encoding/src/lib.rs | 4 +- vello_shaders/src/cpu.rs | 4 +- 6 files changed, 223 insertions(+), 63 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 6d57a6de0..d9a0c0c06 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -89,6 +89,10 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; #[cfg(feature = "wgpu")] use std::{num::NonZeroUsize, sync::Arc}; @@ -102,6 +106,7 @@ pub use skrifa; pub mod glyph; +use vello_encoding::{BufferSize, BumpBufferSizes}; #[cfg(feature = "wgpu")] pub use wgpu; @@ -122,7 +127,7 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; -use wgpu::{Buffer, BufferUsages}; +use wgpu::SubmissionIndex; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -130,7 +135,7 @@ pub use debug::DebugLayers; /// Temporary export, used in `with_winit` for stats pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] -use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView}; +use wgpu::{Buffer, BufferUsages, Device, Queue, SurfaceTexture, TextureFormat, TextureView}; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -241,6 +246,8 @@ pub enum Error { #[allow(dead_code)] // this can be unused when wgpu feature is not used pub(crate) type Result = std::result::Result; +type BumpSubmission = (SubmissionIndex, Buffer, Arc); + /// Renders a scene into a 
texture or surface. #[cfg(feature = "wgpu")] pub struct Renderer { @@ -252,8 +259,12 @@ pub struct Renderer { blit: Option, #[cfg(feature = "debug_layers")] debug: Option, - bump: Option, target: Option, + // Fields for robust dynamic memory + bump: Option, + previous_submission: Option, + previouser_submission: Option, + bump_sizes: BumpBufferSizes, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, #[cfg(feature = "wgpu-profiler")] @@ -353,6 +364,9 @@ impl Renderer { debug, target: None, bump: None, + previous_submission: None, + previouser_submission: None, + bump_sizes: Default::default(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -389,9 +403,14 @@ impl Renderer { scene: &Scene, texture: &TextureView, params: &RenderParams, - ) -> Result<()> { - let (mut recording, target, bump_buf) = - render::render_full(scene, &mut self.resolver, &self.shaders, params); + ) -> Result> { + let (mut recording, target, bump_buf) = render::render_full( + scene, + &mut self.resolver, + &self.shaders, + params, + self.bump_sizes, + ); let cpu_external; let gpu_external; let gpu_bump; @@ -416,7 +435,7 @@ impl Renderer { }); gpu_external = [ ExternalResource::Image(target, texture), - ExternalResource::Buffer(bump_buf, &gpu_bump), + ExternalResource::Buffer(bump_buf, gpu_bump), ]; &gpu_external }; @@ -430,7 +449,8 @@ impl Renderer { #[cfg(feature = "wgpu-profiler")] &mut self.profiler, )?; - Ok(()) + let bump_download = self.engine.take_download(bump_buf); + Ok(bump_download) } /// Renders a scene to the target surface. @@ -449,6 +469,67 @@ impl Renderer { surface: &SurfaceTexture, params: &RenderParams, ) -> Result<()> { + let buffer_completed = if let Some((idx, bump, completed)) = + self.previouser_submission.take() + { + // Ensure that we have the bump buffer from the rendering two frames ago + // The previous frame will have been cancelled if that is the case + + // Warning: Blocks! + device.poll(wgpu::MaintainBase::WaitForSubmissionIndex(idx)); + + if completed.swap(false, Ordering::Acquire) { + { + let slice = &bump.slice(..); + let data = slice.get_mapped_range(); + let data: BumpAllocators = bytemuck::pod_read_unaligned(&*data); + if data.failed != 0 { + if data.failed == 0x20 { + eprintln!( + "Run failed but next run will be retried, reallocated in last run" + ); + } else { + eprintln!("Previous run failed, need to reallocate"); + // TODO: Be smarter here, e.g. notice that we're over by a certain factor + // and bump several buffers? + + // TODO: Also reduce allocation sizes + // TODO: Free buffers which haven't been used in "a while" + + // TODO: This ignore the draw tag length? + if data.binning > self.bump_sizes.bin_data.len() { + self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); + } + if data.lines > self.bump_sizes.lines.len() { + self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); + } + // if data.blend > self.bump_sizes.? 
// TODO + if data.ptcl > self.bump_sizes.ptcl.len() { + self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 5 / 4); + } + if data.seg_counts > self.bump_sizes.seg_counts.len() { + self.bump_sizes.seg_counts = + BufferSize::new(data.seg_counts * 5 / 4); + } + if data.tile > self.bump_sizes.tiles.len() { + self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); + } + if data.segments > self.bump_sizes.segments.len() { + self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); + } + } + } + } + bump.unmap(); + // TODO: Return `bump` into the engine's pool + } else { + // Downloading the buffer failed; we just assume that we can keep going? + } + completed + } else { + Arc::new(AtomicBool::new(false)) + }; + let width = params.width; let height = params.height; let mut target = self @@ -460,7 +541,7 @@ impl Renderer { if target.width != width || target.height != height { target = TargetTexture::new(device, width, height); } - self.render_to_texture(device, queue, scene, &target.view, params)?; + let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; let blit = self .blit .as_ref() @@ -497,7 +578,28 @@ impl Renderer { "blit (render_to_surface)", #[cfg(feature = "wgpu-profiler")] &mut self.profiler, - )?; + ); + if let Some(download) = &bump_download { + let completed = buffer_completed.clone(); + download + .slice(..) + .map_async(wgpu::MapMode::Read, move |res| match res { + Ok(()) => { + completed.store(true, Ordering::Release); + } + Err(e) => { + log::warn!("Failed to map bump buffer: {e}"); + } + }); + }; + let idx = queue.submit(Some(encoder.finish())); + if let Some(download) = bump_download { + self.previouser_submission = + self.previous_submission + .replace((idx, download, buffer_completed)); + } else { + self.previouser_submission = self.previous_submission.take(); + } self.target = Some(target); #[cfg(feature = "wgpu-profiler")] { @@ -606,6 +708,7 @@ impl Renderer { &mut self.resolver, &self.shaders, params, + Default::default(), robust, ); let target = render.out_image(); diff --git a/vello/src/render.rs b/vello/src/render.rs index 1ee235de8..4918946ec 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -13,7 +13,9 @@ use crate::{AaConfig, RenderParams}; #[cfg(feature = "wgpu")] use crate::Scene; -use vello_encoding::{make_mask_lut, make_mask_lut_16, Encoding, Resolver, WorkgroupSize}; +use vello_encoding::{ + make_mask_lut, make_mask_lut_16, BumpBufferSizes, Encoding, Resolver, WorkgroupSize, +}; /// State for a render in progress. 
pub struct Render { @@ -82,8 +84,9 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, ) -> (Recording, ImageProxy, BufferProxy) { - render_encoding_full(scene.encoding(), resolver, shaders, params) + render_encoding_full(scene.encoding(), resolver, shaders, params, bump_sizes) } #[cfg(feature = "wgpu")] @@ -96,9 +99,11 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); - let mut recording = render.render_encoding_coarse(encoding, resolver, shaders, params, false); + let mut recording = + render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, true); let out_image = render.out_image(); let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); @@ -132,6 +137,7 @@ impl Render { resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, + bump_sizes: BumpBufferSizes, robust: bool, ) -> Recording { use vello_encoding::RenderConfig; @@ -167,8 +173,13 @@ impl Render { for image in images.images { recording.write_image(image_atlas, image.1, image.2, image.0.clone()); } - let cpu_config = - RenderConfig::new(&layout, params.width, params.height, ¶ms.base_color); + let cpu_config = RenderConfig::new( + &layout, + params.width, + params.height, + ¶ms.base_color, + bump_sizes, + ); let buffer_sizes = &cpu_config.buffer_sizes; let wg_counts = &cpu_config.workgroup_counts; @@ -184,14 +195,21 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bin_data.size_in_bytes() as u64, + buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64, "info_bin_data_buf", ); - let tile_buf = - ResourceProxy::new_buf(buffer_sizes.tiles.size_in_bytes().into(), "tile_buf"); - let segments_buf = - ResourceProxy::new_buf(buffer_sizes.segments.size_in_bytes().into(), "segments_buf"); - let ptcl_buf = ResourceProxy::new_buf(buffer_sizes.ptcl.size_in_bytes().into(), "ptcl_buf"); + let tile_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.tiles.size_in_bytes().into(), + "tile_buf", + ); + let segments_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.segments.size_in_bytes().into(), + "segments_buf", + ); + let ptcl_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.ptcl.size_in_bytes().into(), + "ptcl_buf", + ); let reduced_buf = ResourceProxy::new_buf( buffer_sizes.path_reduced.size_in_bytes().into(), "reduced_buf", @@ -259,8 +277,10 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let lines_buf = - ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); + let lines_buf = ResourceProxy::new_buf( + buffer_sizes.bump_buffers.lines.size_in_bytes().into(), + "lines_buf", + ); recording.dispatch( shaders.flatten, wg_counts.flatten, @@ -391,7 +411,7 @@ impl Render { [bump_buf, indirect_count_buf.into()], ); let seg_counts_buf = ResourceProxy::new_buf( - buffer_sizes.seg_counts.size_in_bytes().into(), + buffer_sizes.bump_buffers.seg_counts.size_in_bytes().into(), "seg_counts_buf", ); recording.dispatch_indirect( diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index fa3e80b3d..adb2ec8ed 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -685,6 +685,12 @@ impl WgpuEngine { let src_buf = self .bind_map .get_gpu_buf(proxy.id) + 
.or_else(|| { + transient_map.bufs.get(&proxy.id).and_then(|it| match it { + TransientBuf::Cpu(_) => None, + TransientBuf::Gpu(buf) => Some(*buf), + }) + }) .ok_or(Error::UnavailableBufferUsed(proxy.name, "download"))?; let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST; let buf = self.pool.get_buf(proxy.size, "download", usage, device); @@ -748,6 +754,10 @@ impl WgpuEngine { self.downloads.get(&buf.id) } + pub fn take_download(&mut self, buf: BufferProxy) -> Option { + self.downloads.remove(&buf.id) + } + pub fn free_download(&mut self, buf: BufferProxy) { self.downloads.remove(&buf.id); } diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index db5bb92c8..24e0aae3c 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -170,7 +170,13 @@ pub struct RenderConfig { } impl RenderConfig { - pub fn new(layout: &Layout, width: u32, height: u32, base_color: &peniko::Color) -> Self { + pub fn new( + layout: &Layout, + width: u32, + height: u32, + base_color: &peniko::Color, + bump_buffers: BumpBufferSizes, + ) -> Self { let new_width = width.next_multiple_of(TILE_WIDTH); let new_height = height.next_multiple_of(TILE_HEIGHT); let width_in_tiles = new_width / TILE_WIDTH; @@ -178,7 +184,7 @@ impl RenderConfig { let n_path_tags = layout.path_tags_size(); let workgroup_counts = WorkgroupCounts::new(layout, width_in_tiles, height_in_tiles, n_path_tags); - let buffer_sizes = BufferSizes::new(layout, &workgroup_counts); + let buffer_sizes = BufferSizes::new(layout, &workgroup_counts, bump_buffers); Self { gpu: ConfigUniform { width_in_tiles, @@ -187,13 +193,13 @@ impl RenderConfig { target_width: width, target_height: height, base_color: base_color.to_premul_u32(), - lines_size: buffer_sizes.lines.len(), - binning_size: buffer_sizes.bin_data.len() - layout.bin_data_start, - tiles_size: buffer_sizes.tiles.len(), - seg_counts_size: buffer_sizes.seg_counts.len(), - segments_size: buffer_sizes.segments.len(), - blend_size: buffer_sizes.blend_spill.len(), - ptcl_size: buffer_sizes.ptcl.len(), + lines_size: buffer_sizes.bump_buffers.lines.len(), + binning_size: buffer_sizes.bump_buffers.bin_data.len() - layout.bin_data_start, + tiles_size: buffer_sizes.bump_buffers.tiles.len(), + seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), + segments_size: buffer_sizes.bump_buffers.segments.len(), + blend_size: buffer_sizes.bump_buffers.blend_spill.len(), + ptcl_size: buffer_sizes.bump_buffers.ptcl.len(), layout: *layout, }, workgroup_counts, @@ -333,6 +339,16 @@ impl PartialOrd for BufferSize { self.len.partial_cmp(&other.len) } } +#[derive(Copy, Clone, Debug)] +pub struct BumpBufferSizes { + pub lines: BufferSize, + pub bin_data: BufferSize, + pub tiles: BufferSize, + pub seg_counts: BufferSize, + pub segments: BufferSize, + pub ptcl: BufferSize, + pub blend_spill: BufferSize, +} /// Computed sizes for all buffers. #[derive(Copy, Clone, Debug, Default)] @@ -356,17 +372,46 @@ pub struct BufferSizes { pub bin_headers: BufferSize, pub paths: BufferSize, // Bump allocated buffers - pub lines: BufferSize, - pub bin_data: BufferSize, - pub tiles: BufferSize, - pub seg_counts: BufferSize, - pub segments: BufferSize, - pub blend_spill: BufferSize, - pub ptcl: BufferSize, + pub bump_buffers: BumpBufferSizes, +} + +impl BumpBufferSizes { + fn new() -> Self { + // The following buffer sizes have been hand picked to accommodate the vello test scenes as + // well as paris-30k. These should instead get derived from the scene layout using + // reasonable heuristics. 
+ let bin_data = BufferSize::new(1 << 18); + let tiles = BufferSize::new(1 << 21); + let lines = BufferSize::new(1 << 21); + let seg_counts = BufferSize::new(1 << 21); + let segments = BufferSize::new(1 << 21); + let ptcl = BufferSize::new(1 << 23); + // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. + let blend_spill = BufferSize::new(1 << 20); + BumpBufferSizes { + bin_data, + lines, + ptcl, + seg_counts, + segments, + tiles, + blend_spill, + } + } +} + +impl Default for BumpBufferSizes { + fn default() -> Self { + Self::new() + } } impl BufferSizes { - pub fn new(layout: &Layout, workgroups: &WorkgroupCounts) -> Self { + pub fn new( + layout: &Layout, + workgroups: &WorkgroupCounts, + bump_buffers: BumpBufferSizes, + ) -> Self { let n_paths = layout.n_paths; let n_draw_objects = layout.n_draw_objects; let n_clips = layout.n_clips; @@ -397,17 +442,6 @@ impl BufferSizes { let n_paths_aligned = align_up(n_paths, 256); let paths = BufferSize::new(n_paths_aligned); - // The following buffer sizes have been hand picked to accommodate the vello test scenes as - // well as paris-30k. These should instead get derived from the scene layout using - // reasonable heuristics. - let bin_data = BufferSize::new(1 << 18); - let tiles = BufferSize::new(1 << 21); - let lines = BufferSize::new(1 << 21); - let seg_counts = BufferSize::new(1 << 21); - let segments = BufferSize::new(1 << 21); - // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. - let blend_spill = BufferSize::new(1 << 20); - let ptcl = BufferSize::new(1 << 23); Self { path_reduced, path_reduced2, @@ -424,15 +458,8 @@ impl BufferSizes { draw_bboxes, bump_alloc, indirect_count, - lines, bin_headers, paths, - bin_data, - tiles, - seg_counts, - segments, - blend_spill, - ptcl, } } } diff --git a/vello_encoding/src/lib.rs b/vello_encoding/src/lib.rs index 30db95000..8e1109308 100644 --- a/vello_encoding/src/lib.rs +++ b/vello_encoding/src/lib.rs @@ -27,8 +27,8 @@ mod resolve; pub use binning::BinHeader; pub use clip::{Clip, ClipBbox, ClipBic, ClipElement}; pub use config::{ - BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, ConfigUniform, IndirectCount, - RenderConfig, WorkgroupCounts, WorkgroupSize, + BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, BumpBufferSizes, ConfigUniform, + IndirectCount, RenderConfig, WorkgroupCounts, WorkgroupSize, }; pub use draw::{ DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid, diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 2266b283c..30ca367f3 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -27,9 +27,9 @@ mod path_tiling; mod path_tiling_setup; mod pathtag_reduce; mod pathtag_scan; +mod prepare; mod tile_alloc; mod util; -mod prepare; pub use backdrop::backdrop; pub use bbox_clear::bbox_clear; @@ -46,8 +46,8 @@ pub use path_tiling::path_tiling; pub use path_tiling_setup::path_tiling_setup; pub use pathtag_reduce::pathtag_reduce; pub use pathtag_scan::pathtag_scan; -pub use tile_alloc::tile_alloc; pub use prepare::prepare; +pub use tile_alloc::tile_alloc; use std::cell::{Ref, RefCell, RefMut}; use std::ops::{Deref, DerefMut}; From f07391de6a52c99b97dc390fdf8e42f63c37b1d8 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:43:27 +0100 Subject: [PATCH 06/18] Fix CPU runner and clippy --- vello/src/lib.rs | 2 +- vello/src/render.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/vello/src/lib.rs b/vello/src/lib.rs index d9a0c0c06..9e6097100 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -482,7 +482,7 @@ impl Renderer { { let slice = &bump.slice(..); let data = slice.get_mapped_range(); - let data: BumpAllocators = bytemuck::pod_read_unaligned(&*data); + let data: BumpAllocators = bytemuck::pod_read_unaligned(&data); if data.failed != 0 { if data.failed == 0x20 { eprintln!( diff --git a/vello/src/render.rs b/vello/src/render.rs index 4918946ec..dd9ed7d77 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -490,7 +490,8 @@ impl Render { image_atlas: ResourceProxy::Image(image_atlas), out_image, }); - if robust { + // TODO: This second check is a massive hack + if robust && !shaders.pathtag_is_cpu { recording.download(*bump_buf.as_buf().unwrap()); } recording.free_resource(bump_buf); From 0fd3097c1ef7129a7aebf408a9ca789143432d66 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:57:15 +0100 Subject: [PATCH 07/18] Fixup some broken behaviour and add comments --- examples/scenes/src/mmark.rs | 3 ++- vello/src/lib.rs | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/scenes/src/mmark.rs b/examples/scenes/src/mmark.rs index 3deea8d79..d05a4d7bb 100644 --- a/examples/scenes/src/mmark.rs +++ b/examples/scenes/src/mmark.rs @@ -74,7 +74,8 @@ impl TestScene for MMark { let n = if c < 10 { (c + 1) * 1000 } else { - ((c - 8) * 10000).min(120_000) + // The 190_000 scene can't be uploaded due to wgpu's limits + ((c - 8) * 10000).min(180_000) }; self.resize(n); let mut rng = rand::thread_rng(); diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 9e6097100..2c459d6cd 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -489,7 +489,10 @@ impl Renderer { "Run failed but next run will be retried, reallocated in last run" ); } else { - eprintln!("Previous run failed, need to reallocate"); + eprintln!( + "Previous run failed, need to reallocate: {:x?}", + data.failed + ); // TODO: Be smarter here, e.g. notice that we're over by a certain factor // and bump several buffers? @@ -505,7 +508,8 @@ impl Renderer { } // if data.blend > self.bump_sizes.? 
// TODO if data.ptcl > self.bump_sizes.ptcl.len() { - self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 5 / 4); + // TODO: At 5/4, this doesn't work very well + self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); } if data.seg_counts > self.bump_sizes.seg_counts.len() { self.bump_sizes.seg_counts = From fe2d6b9a83182d920ef27ce175f79310dc06299c Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:42:34 +0100 Subject: [PATCH 08/18] Push some debug logging and a few misc improvements --- vello/src/lib.rs | 55 +++++++++++++++++++++++++++---- vello/src/render.rs | 10 +++++- vello_encoding/src/config.rs | 14 ++++---- vello_shaders/shader/coarse.wgsl | 5 +-- vello_shaders/shader/prepare.wgsl | 5 +-- 5 files changed, 69 insertions(+), 20 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 2c459d6cd..c8d787a4f 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -484,43 +484,86 @@ impl Renderer { let data = slice.get_mapped_range(); let data: BumpAllocators = bytemuck::pod_read_unaligned(&data); if data.failed != 0 { - if data.failed == 0x20 { - eprintln!( + if data.failed & 0x20 != 0 { + log::debug!( "Run failed but next run will be retried, reallocated in last run" ); } else { - eprintln!( - "Previous run failed, need to reallocate: {:x?}", - data.failed - ); + // log::info!( + // "Previous run failed, need to reallocate: {:x?}", + // data.failed + // ); + // log::debug!("{:?}", data); // TODO: Be smarter here, e.g. notice that we're over by a certain factor // and bump several buffers? + let mut changed = false; // TODO: Also reduce allocation sizes // TODO: Free buffers which haven't been used in "a while" // TODO: This ignore the draw tag length? if data.binning > self.bump_sizes.bin_data.len() { + changed = true; + log::debug!( + "Resizing binning from {:?} to {:?}", + self.bump_sizes.bin_data, + data.binning + ); self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); } if data.lines > self.bump_sizes.lines.len() { + changed = true; + log::debug!( + "Resizing lines from {:?} to {:?}", + self.bump_sizes.lines, + data.lines + ); self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); } // if data.blend > self.bump_sizes.? // TODO if data.ptcl > self.bump_sizes.ptcl.len() { + changed = true; + log::debug!( + "Resizing ptcl from {:?} to {:?}", + self.bump_sizes.ptcl, + data.ptcl + ); // TODO: At 5/4, this doesn't work very well self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); } if data.seg_counts > self.bump_sizes.seg_counts.len() { + changed = true; + log::debug!( + "Resizing seg_counts from {:?} to {:?}", + self.bump_sizes.seg_counts, + data.seg_counts + ); self.bump_sizes.seg_counts = BufferSize::new(data.seg_counts * 5 / 4); } if data.tile > self.bump_sizes.tiles.len() { + changed = true; + log::debug!( + "Resizing tiles from {:?} to {:?}", + self.bump_sizes.tiles, + data.tile + ); self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); } if data.segments > self.bump_sizes.segments.len() { + changed = true; + log::debug!( + "Resizing segments from {:?} to {:?}", + self.bump_sizes.segments, + data.segments + ); self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); } + if !changed { + log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); + } else { + log::info!("Detected need for reallocation, and did reallocate {:x?}. 
Data {data:?}", data.failed); + } } } } diff --git a/vello/src/render.rs b/vello/src/render.rs index dd9ed7d77..8c441df5c 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -180,6 +180,13 @@ impl Render { ¶ms.base_color, bump_sizes, ); + // log::debug!("Config: {{ lines_size: {:?}, binning_size: {:?}, tiles_size: {:?}, seg_counts_size: {:?}, segments_size: {:?}, ptcl_size: {:?} }}", + // cpu_config.gpu.lines_size, + // cpu_config.gpu.binning_size, + // cpu_config.gpu.tiles_size, + // cpu_config.gpu.seg_counts_size, + // cpu_config.gpu.segments_size, + // cpu_config.gpu.ptcl_size); let buffer_sizes = &cpu_config.buffer_sizes; let wg_counts = &cpu_config.workgroup_counts; @@ -195,7 +202,8 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64, + buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64 + + (layout.bin_data_start as u64) * std::mem::size_of::() as u64, "info_bin_data_buf", ); let tile_buf = ResourceProxy::new_buf( diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 24e0aae3c..68ef34d9c 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -194,7 +194,7 @@ impl RenderConfig { target_height: height, base_color: base_color.to_premul_u32(), lines_size: buffer_sizes.bump_buffers.lines.len(), - binning_size: buffer_sizes.bump_buffers.bin_data.len() - layout.bin_data_start, + binning_size: buffer_sizes.bump_buffers.bin_data.len(), tiles_size: buffer_sizes.bump_buffers.tiles.len(), seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), segments_size: buffer_sizes.bump_buffers.segments.len(), @@ -380,12 +380,12 @@ impl BumpBufferSizes { // The following buffer sizes have been hand picked to accommodate the vello test scenes as // well as paris-30k. These should instead get derived from the scene layout using // reasonable heuristics. - let bin_data = BufferSize::new(1 << 18); - let tiles = BufferSize::new(1 << 21); - let lines = BufferSize::new(1 << 21); - let seg_counts = BufferSize::new(1 << 21); - let segments = BufferSize::new(1 << 21); - let ptcl = BufferSize::new(1 << 23); + let bin_data = BufferSize::new(1 << 12); + let tiles = BufferSize::new(1 << 15); + let lines = BufferSize::new(1 << 15); + let seg_counts = BufferSize::new(1 << 15); + let segments = BufferSize::new(1 << 15); + let ptcl = BufferSize::new(1 << 17); // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. let blend_spill = BufferSize::new(1 << 20); BumpBufferSizes { diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index ba0edfa8f..19e4e240f 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -68,10 +68,7 @@ var cmd_limit: u32; // Make sure there is space for a command of given size, plus a jump if needed fn alloc_cmd(size: u32) { if cmd_offset + size >= cmd_limit { - // We might be able to save a little bit of computation here - // by setting the initial value of the bump allocator. - let ptcl_dyn_start = config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; - var new_cmd = ptcl_dyn_start + atomicAdd(&bump.ptcl, PTCL_INCREMENT); + var new_cmd = atomicAdd(&bump.ptcl, PTCL_INCREMENT); if new_cmd + PTCL_INCREMENT > config.ptcl_size { // This sets us up for technical UB, as lots of threads will be writing // to the same locations. 
But I think it's fine, and predicating the diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 93ed8c447..412113288 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -7,6 +7,7 @@ #import config #import bump +#import ptcl @group(0) @binding(0) var config: Config; @@ -19,7 +20,7 @@ var bump: BumpAllocators; fn main() { var should_cancel = false; let previous_failure = atomicLoad(&bump.failed); - if previous_failure == PREVIOUS_RUN { + if (previous_failure & PREVIOUS_RUN) != 0 { // Don't early-exit from multiple frames in a row // The CPU should be blocking on the frame which failed anyway, so this // case should never be reached, but if the CPU side isn't doing that @@ -62,7 +63,7 @@ fn main() { } } atomicStore(&bump.binning, 0u); - atomicStore(&bump.ptcl, 0u); + atomicStore(&bump.ptcl, config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC); atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); From cc010176ee499201f751ca03885381f30691d898 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:58:14 +0100 Subject: [PATCH 09/18] Fixup handling to not be busted --- examples/scenes/src/mmark.rs | 2 +- vello/src/lib.rs | 75 +++++++++------ vello/src/render.rs | 15 ++- vello/src/wgpu_engine.rs | 10 ++ vello_encoding/src/config.rs | 123 ++++++++++++------------ vello_encoding/src/lib.rs | 4 +- vello_encoding/src/resolve.rs | 2 + vello_shaders/shader/coarse.wgsl | 10 +- vello_shaders/shader/prepare.wgsl | 9 +- vello_shaders/shader/shared/config.wgsl | 3 + vello_shaders/src/cpu.rs | 2 +- vello_shaders/src/cpu/coarse.rs | 4 +- 12 files changed, 144 insertions(+), 115 deletions(-) diff --git a/examples/scenes/src/mmark.rs b/examples/scenes/src/mmark.rs index d05a4d7bb..042484bbb 100644 --- a/examples/scenes/src/mmark.rs +++ b/examples/scenes/src/mmark.rs @@ -75,7 +75,7 @@ impl TestScene for MMark { (c + 1) * 1000 } else { // The 190_000 scene can't be uploaded due to wgpu's limits - ((c - 8) * 10000).min(180_000) + ((c - 8) * 10000).min(190_000) }; self.resize(n); let mut rng = rand::thread_rng(); diff --git a/vello/src/lib.rs b/vello/src/lib.rs index c8d787a4f..3215bfd74 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -106,7 +106,6 @@ pub use skrifa; pub mod glyph; -use vello_encoding::{BufferSize, BumpBufferSizes}; #[cfg(feature = "wgpu")] pub use wgpu; @@ -264,7 +263,7 @@ pub struct Renderer { bump: Option, previous_submission: Option, previouser_submission: Option, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, #[cfg(feature = "wgpu-profiler")] pub profiler: GpuProfiler, #[cfg(feature = "wgpu-profiler")] @@ -366,7 +365,7 @@ impl Renderer { bump: None, previous_submission: None, previouser_submission: None, - bump_sizes: Default::default(), + bump_sizes: BumpAllocators::default(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { @@ -500,64 +499,76 @@ impl Renderer { let mut changed = false; // TODO: Also reduce allocation sizes // TODO: Free buffers which haven't been used in "a while" + // TODO: We should have awareness of the maximum binding size supported by the device + // That's easy for all buffers but lines and ptcl - // TODO: This ignore the draw tag length? 
- if data.binning > self.bump_sizes.bin_data.len() { + if data.binning > self.bump_sizes.binning { changed = true; + let new_size = data.binning * 5 / 4; log::debug!( - "Resizing binning from {:?} to {:?}", - self.bump_sizes.bin_data, - data.binning + "Resizing binning to {:?} (Needed {:?}, had {:?})", + new_size, + data.binning, + self.bump_sizes.binning, ); - self.bump_sizes.bin_data = BufferSize::new(data.binning * 3 / 2); + self.bump_sizes.binning = new_size; } - if data.lines > self.bump_sizes.lines.len() { + if data.lines > self.bump_sizes.lines { changed = true; + let new_size = data.lines * 5 / 4; log::debug!( - "Resizing lines from {:?} to {:?}", + "Resizing lines to {:?} (Needed {:?}, had {:?})", + new_size, + data.lines, self.bump_sizes.lines, - data.lines ); - self.bump_sizes.lines = BufferSize::new(data.lines * 5 / 4); + self.bump_sizes.lines = new_size; } // if data.blend > self.bump_sizes.? // TODO - if data.ptcl > self.bump_sizes.ptcl.len() { + if data.ptcl > self.bump_sizes.ptcl { changed = true; + // TODO: At 5/4, this doesn't work very well + let new_size = data.ptcl * 5 / 4; log::debug!( - "Resizing ptcl from {:?} to {:?}", + "Resizing ptcl to {:?} (Needed {:?}, had {:?})", + new_size, + data.ptcl, self.bump_sizes.ptcl, - data.ptcl ); - // TODO: At 5/4, this doesn't work very well - self.bump_sizes.ptcl = BufferSize::new(data.ptcl * 3 / 2); + self.bump_sizes.ptcl = new_size; } - if data.seg_counts > self.bump_sizes.seg_counts.len() { + if data.seg_counts > self.bump_sizes.seg_counts { changed = true; + let new_size = data.seg_counts * 5 / 4; log::debug!( - "Resizing seg_counts from {:?} to {:?}", + "Resizing seg_counts to {:?} (Needed {:?}, had {:?})", + new_size, + data.seg_counts, self.bump_sizes.seg_counts, - data.seg_counts ); - self.bump_sizes.seg_counts = - BufferSize::new(data.seg_counts * 5 / 4); + self.bump_sizes.seg_counts = new_size; } - if data.tile > self.bump_sizes.tiles.len() { + if data.tile > self.bump_sizes.tile { changed = true; + let new_size = data.tile * 5 / 4; log::debug!( - "Resizing tiles from {:?} to {:?}", - self.bump_sizes.tiles, - data.tile + "Resizing tile to {:?} (Needed {:?}, had {:?})", + new_size, + data.tile, + self.bump_sizes.tile, ); - self.bump_sizes.tiles = BufferSize::new(data.tile * 5 / 4); + self.bump_sizes.tile = new_size; } - if data.segments > self.bump_sizes.segments.len() { + if data.segments > self.bump_sizes.segments { changed = true; + let new_size = data.segments * 5 / 4; log::debug!( - "Resizing segments from {:?} to {:?}", + "Resizing segments to {:?} (Needed {:?}, had {:?})", + new_size, + data.segments, self.bump_sizes.segments, - data.segments ); - self.bump_sizes.segments = BufferSize::new(data.segments * 5 / 4); + self.bump_sizes.segments = new_size; } if !changed { log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); diff --git a/vello/src/render.rs b/vello/src/render.rs index 8c441df5c..c45ba6296 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -14,7 +14,7 @@ use crate::{AaConfig, RenderParams}; use crate::Scene; use vello_encoding::{ - make_mask_lut, make_mask_lut_16, BumpBufferSizes, Encoding, Resolver, WorkgroupSize, + make_mask_lut, make_mask_lut_16, BumpAllocators, Encoding, Resolver, WorkgroupSize, }; /// State for a render in progress. 
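The reallocation branch above applies the same rule to every bump counter. A minimal sketch of that rule, assuming a hypothetical free-standing helper (`grow` is not a function in this series; the patch itself updates each `self.bump_sizes` field inline, and `log` is already a dependency used by the surrounding code):

    // Only grow, never shrink, and leave ~25% headroom (the 5/4 factor used above)
    // over the requirement observed in the downloaded BumpAllocators counters.
    fn grow(current: &mut u32, needed: u32, label: &str) -> bool {
        if needed <= *current {
            return false;
        }
        let new_size = needed * 5 / 4;
        log::debug!("Resizing {label} to {new_size} (Needed {needed}, had {current})");
        *current = new_size;
        true
    }

Calling such a helper once per counter mirrors the `changed` flag in the code above: if any counter exceeded its buffer, the sizes are enlarged before the next frame's config is built.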
@@ -84,7 +84,7 @@ pub(crate) fn render_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, ) -> (Recording, ImageProxy, BufferProxy) { render_encoding_full(scene.encoding(), resolver, shaders, params, bump_sizes) } @@ -99,7 +99,7 @@ pub(crate) fn render_encoding_full( resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = @@ -137,7 +137,7 @@ impl Render { resolver: &mut Resolver, shaders: &FullShaders, params: &RenderParams, - bump_sizes: BumpBufferSizes, + bump_sizes: BumpAllocators, robust: bool, ) -> Recording { use vello_encoding::RenderConfig; @@ -174,7 +174,7 @@ impl Render { recording.write_image(image_atlas, image.1, image.2, image.0.clone()); } let cpu_config = RenderConfig::new( - &layout, + layout, params.width, params.height, ¶ms.base_color, @@ -202,12 +202,11 @@ impl Render { recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); let info_bin_data_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.bin_data.size_in_bytes() as u64 - + (layout.bin_data_start as u64) * std::mem::size_of::() as u64, + buffer_sizes.bump_buffers.binning.size_in_bytes() as u64, "info_bin_data_buf", ); let tile_buf = ResourceProxy::new_buf( - buffer_sizes.bump_buffers.tiles.size_in_bytes().into(), + buffer_sizes.bump_buffers.tile.size_in_bytes().into(), "tile_buf", ); let segments_buf = ResourceProxy::new_buf( diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index adb2ec8ed..d53044540 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -952,6 +952,16 @@ impl ResourcePool { device: &Device, ) -> Buffer { let rounded_size = Self::size_class(size, SIZE_CLASS_BITS); + // let max_storage_buffer_binding_size = + // device.limits().max_storage_buffer_binding_size.into(); + // if rounded_size > max_storage_buffer_binding_size { + // if size < max_storage_buffer_binding_size { + // log::warn!("Would allocate buffer {name} to be larger than {max_storage_buffer_binding_size}. Clamped"); + // rounded_size = max_storage_buffer_binding_size; + // } else { + // log::warn!("Would allocate buffer {name} to be larger than {max_storage_buffer_binding_size}, which is not allowed"); + // } + // } let props = BufferProperties { size: rounded_size, usages: usage, diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 68ef34d9c..6aeca0d46 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -14,6 +14,7 @@ const TILE_WIDTH: u32 = 16; const TILE_HEIGHT: u32 = 16; // TODO: Obtain these from the vello_shaders crate +pub const PTCL_INITIAL_ALLOC: u32 = 64; pub(crate) const PATH_REDUCE_WG: u32 = 256; const PATH_BBOX_WG: u32 = 256; const FLATTEN_WG: u32 = 256; @@ -22,6 +23,9 @@ const CLIP_REDUCE_WG: u32 = 256; /// Counters for tracking dynamic allocation on the GPU. /// /// This must be kept in sync with the struct in `shader/shared/bump.wgsl` +/// +/// These values do *not* include any pre-allocated sections which use the same +/// underlying buffers. 
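+///
+/// For example, `binning` here counts only the dynamically bumped bin data; the
+/// shared info/bin-data buffer additionally needs `layout.bin_data_start`
+/// elements in front of it, which is what `BumpAllocators::memory` below adds
+/// back in when computing buffer sizes.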
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] #[repr(C)] pub struct BumpAllocators { @@ -37,9 +41,8 @@ pub struct BumpAllocators { pub lines: u32, } -#[derive(Default)] +#[derive(Default, Copy, Clone, Debug)] pub struct BumpAllocatorMemory { - pub total: u32, pub binning: BufferSize, pub ptcl: BufferSize, pub tile: BufferSize, @@ -49,20 +52,14 @@ pub struct BumpAllocatorMemory { } impl BumpAllocators { - pub fn memory(&self) -> BumpAllocatorMemory { - let binning = BufferSize::new(self.binning); - let ptcl = BufferSize::new(self.ptcl); + pub fn memory(&self, layout: &Layout) -> BumpAllocatorMemory { + let binning = BufferSize::new(self.binning + layout.bin_data_start); + let ptcl = BufferSize::new(self.ptcl + layout.ptcl_dyn_start); let tile = BufferSize::new(self.tile); let seg_counts = BufferSize::new(self.seg_counts); let segments = BufferSize::new(self.segments); let lines = BufferSize::new(self.lines); BumpAllocatorMemory { - total: binning.size_in_bytes() - + ptcl.size_in_bytes() - + tile.size_in_bytes() - + seg_counts.size_in_bytes() - + segments.size_in_bytes() - + lines.size_in_bytes(), binning, ptcl, tile, @@ -73,8 +70,26 @@ impl BumpAllocators { } } +impl BumpAllocatorMemory { + /// Get the total memory used by the bump buffers + pub fn total(&self) -> u64 { + [ + self.binning.size_in_bytes(), + self.ptcl.size_in_bytes(), + self.tile.size_in_bytes(), + self.seg_counts.size_in_bytes(), + self.segments.size_in_bytes(), + self.lines.size_in_bytes(), + ] + .into_iter() + .map(u64::from) + .sum() + } +} + impl std::fmt::Display for BumpAllocatorMemory { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let total = self.total(); write!( f, "\n \ @@ -85,9 +100,9 @@ impl std::fmt::Display for BumpAllocatorMemory { \tSegment Counts:\t\t{} elements ({} bytes)\n\ \tSegments:\t\t{} elements ({} bytes)\n\ \tLines:\t\t\t{} elements ({} bytes)", - self.total, - self.total as f32 / (1 << 10) as f32, - self.total as f32 / (1 << 20) as f32, + total, + total as f32 / (1 << 10) as f32, + total as f32 / (1 << 20) as f32, self.binning.len(), self.binning.size_in_bytes(), self.ptcl.len(), @@ -138,6 +153,7 @@ pub struct ConfigUniform { /// Layout of packed scene data. pub layout: Layout, /// Whether this stage has been cancelled at startup due to a predicted + /// memory allocation failure. /// /// Will be set by the `prepare` stage, and so should always be 0 on CPU. 
pub cancelled: u32, @@ -171,20 +187,21 @@ pub struct RenderConfig { impl RenderConfig { pub fn new( - layout: &Layout, + mut layout: Layout, width: u32, height: u32, base_color: &peniko::Color, - bump_buffers: BumpBufferSizes, + bump_buffers: BumpAllocators, ) -> Self { let new_width = width.next_multiple_of(TILE_WIDTH); let new_height = height.next_multiple_of(TILE_HEIGHT); let width_in_tiles = new_width / TILE_WIDTH; let height_in_tiles = new_height / TILE_HEIGHT; + layout.ptcl_dyn_start = width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; let n_path_tags = layout.path_tags_size(); let workgroup_counts = - WorkgroupCounts::new(layout, width_in_tiles, height_in_tiles, n_path_tags); - let buffer_sizes = BufferSizes::new(layout, &workgroup_counts, bump_buffers); + WorkgroupCounts::new(&layout, width_in_tiles, height_in_tiles, n_path_tags); + let buffer_sizes = BufferSizes::new(&layout, &workgroup_counts, bump_buffers); Self { gpu: ConfigUniform { width_in_tiles, @@ -193,14 +210,14 @@ impl RenderConfig { target_width: width, target_height: height, base_color: base_color.to_premul_u32(), - lines_size: buffer_sizes.bump_buffers.lines.len(), - binning_size: buffer_sizes.bump_buffers.bin_data.len(), - tiles_size: buffer_sizes.bump_buffers.tiles.len(), - seg_counts_size: buffer_sizes.bump_buffers.seg_counts.len(), - segments_size: buffer_sizes.bump_buffers.segments.len(), - blend_size: buffer_sizes.bump_buffers.blend_spill.len(), - ptcl_size: buffer_sizes.bump_buffers.ptcl.len(), - layout: *layout, + lines_size: bump_buffers.lines, + binning_size: bump_buffers.binning, + tiles_size: bump_buffers.tile, + seg_counts_size: bump_buffers.seg_counts, + segments_size: bump_buffers.segments, + blend_size: buffer_sizes.bump_buffers.blend_spill, + ptcl_size: bump_buffers.ptcl, + layout, }, workgroup_counts, buffer_sizes, @@ -339,17 +356,6 @@ impl PartialOrd for BufferSize { self.len.partial_cmp(&other.len) } } -#[derive(Copy, Clone, Debug)] -pub struct BumpBufferSizes { - pub lines: BufferSize, - pub bin_data: BufferSize, - pub tiles: BufferSize, - pub seg_counts: BufferSize, - pub segments: BufferSize, - pub ptcl: BufferSize, - pub blend_spill: BufferSize, -} - /// Computed sizes for all buffers. #[derive(Copy, Clone, Debug, Default)] pub struct BufferSizes { @@ -372,45 +378,41 @@ pub struct BufferSizes { pub bin_headers: BufferSize, pub paths: BufferSize, // Bump allocated buffers - pub bump_buffers: BumpBufferSizes, + pub bump_buffers: BumpAllocatorMemory, } -impl BumpBufferSizes { - fn new() -> Self { - // The following buffer sizes have been hand picked to accommodate the vello test scenes as - // well as paris-30k. These should instead get derived from the scene layout using - // reasonable heuristics. - let bin_data = BufferSize::new(1 << 12); - let tiles = BufferSize::new(1 << 15); - let lines = BufferSize::new(1 << 15); - let seg_counts = BufferSize::new(1 << 15); - let segments = BufferSize::new(1 << 15); - let ptcl = BufferSize::new(1 << 17); +impl BumpAllocators { + /// The initial sizes which should be used for the bump buffers. + pub fn initial_sizes() -> Self { + // The following buffer sizes have been picked to accommodate small scenes + // and which will grow as needed. + let binning = 1 << 12; + let tile = 1 << 15; + let lines = 1 << 15; + let seg_counts = 1 << 15; + let segments = 1 << 15; + let ptcl = 1 << 17; // 16 * 16 (1 << 8) is one blend spill, so this allows for 4096 spills. 
- let blend_spill = BufferSize::new(1 << 20); - BumpBufferSizes { - bin_data, + let blend_spill = 1 << 20; + BumpAllocators { + binning, lines, ptcl, seg_counts, segments, - tiles, + tile, blend_spill, + blend: 0, + failed: 0, } } } -impl Default for BumpBufferSizes { - fn default() -> Self { - Self::new() - } -} - impl BufferSizes { pub fn new( layout: &Layout, workgroups: &WorkgroupCounts, - bump_buffers: BumpBufferSizes, + bump_buffers: BumpAllocators, ) -> Self { let n_paths = layout.n_paths; let n_draw_objects = layout.n_draw_objects; @@ -441,6 +443,7 @@ impl BufferSizes { let bin_headers = BufferSize::new(binning_wgs * 256); let n_paths_aligned = align_up(n_paths, 256); let paths = BufferSize::new(n_paths_aligned); + let bump_buffers = bump_buffers.memory(layout); Self { path_reduced, diff --git a/vello_encoding/src/lib.rs b/vello_encoding/src/lib.rs index 8e1109308..4c1831191 100644 --- a/vello_encoding/src/lib.rs +++ b/vello_encoding/src/lib.rs @@ -27,8 +27,8 @@ mod resolve; pub use binning::BinHeader; pub use clip::{Clip, ClipBbox, ClipBic, ClipElement}; pub use config::{ - BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, BumpBufferSizes, ConfigUniform, - IndirectCount, RenderConfig, WorkgroupCounts, WorkgroupSize, + BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, ConfigUniform, IndirectCount, + RenderConfig, WorkgroupCounts, WorkgroupSize, PTCL_INITIAL_ALLOC, }; pub use draw::{ DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid, diff --git a/vello_encoding/src/resolve.rs b/vello_encoding/src/resolve.rs index 626e79c84..e437c86c9 100644 --- a/vello_encoding/src/resolve.rs +++ b/vello_encoding/src/resolve.rs @@ -29,6 +29,8 @@ pub struct Layout { pub n_clips: u32, /// Start of binning data. pub bin_data_start: u32, + /// Start of dynamically allocated ptcl commands + pub ptcl_dyn_start: u32, /// Start of path tag stream. pub path_tag_base: u32, /// Start of path data stream. diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index 19e4e240f..a829405f9 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -76,6 +76,7 @@ fn alloc_cmd(size: u32) { new_cmd = 0u; atomicOr(&bump.failed, STAGE_COARSE); } + new_cmd += config.ptcl_dyn_start; ptcl[cmd_offset] = CMD_JUMP; ptcl[cmd_offset + 1u] = new_cmd; cmd_offset = new_cmd; @@ -90,6 +91,11 @@ fn write_path(tile: Tile, tile_ix: u32, draw_flags: u32) { let n_segs = tile.segment_count_or_ix; if n_segs != 0u { var seg_ix = atomicAdd(&bump.segments, n_segs); + if seg_ix > config.segments_size { + // All writes into segments happen in path_tiling, so we don't need + // to avoid UB in this shader + atomicOr(&bump.failed, STAGE_COARSE); + } tiles[tile_ix].segment_count_or_ix = ~seg_ix; alloc_cmd(4u); ptcl[cmd_offset] = CMD_FILL; @@ -157,7 +163,7 @@ fn main( failed |= STAGE_PATH_COUNT; } // Reuse sh_part_count to hold failed flag, shmem is tight - sh_part_count[0] = u32(failed); + sh_part_count[0] = failed; } let failed = workgroupUniformLoad(&sh_part_count[0]); if failed != 0u { @@ -406,7 +412,7 @@ fn main( case DRAWTAG_END_CLIP: { clip_depth -= 1u; // A clip shape is always a non-zero fill (draw_flags=0). 
- write_path(tile, tile_ix, /*draw_flags=*/0u); + write_path(tile, tile_ix, /* draw_flags= */ 0u); let blend = scene[dd]; let alpha = bitcast<f32>(scene[dd + 1u]); write_end_clip(CmdEndClip(blend, alpha)); diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 412113288..01006afd6 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -1,4 +1,4 @@ -// Copyright 2022 the Vello Authors +// Copyright 2024 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense // Determine whether the Vello pipeline is likely to fail during this run @@ -20,7 +20,7 @@ var<storage, read_write> bump: BumpAllocators; fn main() { var should_cancel = false; let previous_failure = atomicLoad(&bump.failed); - if (previous_failure & PREVIOUS_RUN) != 0 { + if (previous_failure & PREVIOUS_RUN) != 0u { // Don't early-exit from multiple frames in a row // The CPU should be blocking on the frame which failed anyway, so this // case should never be reached, but if the CPU side isn't doing that @@ -32,9 +32,6 @@ fn main() { // If the previous frame failed (for another reason) // And we don't have enough memory to have run that previous frame - if config.lines_size < atomicLoad(&bump.lines) { - should_cancel = true; - } if config.binning_size < atomicLoad(&bump.binning) { should_cancel = true; } @@ -63,7 +60,7 @@ fn main() { } } atomicStore(&bump.binning, 0u); - atomicStore(&bump.ptcl, config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC); + atomicStore(&bump.ptcl, 0u); atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); diff --git a/vello_shaders/shader/shared/config.wgsl b/vello_shaders/shader/shared/config.wgsl index 95d68cc7c..c3e9ceed4 100644 --- a/vello_shaders/shader/shared/config.wgsl +++ b/vello_shaders/shader/shared/config.wgsl @@ -21,6 +21,9 @@ struct Config { // To reduce the number of bindings, info and bin data are combined // into one buffer. bin_data_start: u32, + // Each tile gets a pre-allocated area of ptcl; the dynamically allocated + // section is after this.
+ ptcl_dyn_start: u32, // offsets within scene buffer (in u32 units) pathtag_base: u32, diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 30ca367f3..e0ecff483 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -169,7 +169,7 @@ pub struct CpuTexture { // Common internal definitions -const PTCL_INITIAL_ALLOC: u32 = 64; +pub use vello_encoding::PTCL_INITIAL_ALLOC; // Tags for PTCL commands const CMD_END: u32 = 0; diff --git a/vello_shaders/src/cpu/coarse.rs b/vello_shaders/src/cpu/coarse.rs index 88ec6036d..c28907ea3 100644 --- a/vello_shaders/src/cpu/coarse.rs +++ b/vello_shaders/src/cpu/coarse.rs @@ -52,10 +52,8 @@ impl TileState { ptcl: &mut [u32], ) { if self.cmd_offset + size >= self.cmd_limit { - let ptcl_dyn_start = - config.width_in_tiles * config.height_in_tiles * PTCL_INITIAL_ALLOC; let chunk_size = PTCL_INCREMENT.max(size + PTCL_HEADROOM); - let new_cmd = ptcl_dyn_start + bump.ptcl; + let new_cmd = config.layout.ptcl_dyn_start + bump.ptcl; bump.ptcl += chunk_size; ptcl[self.cmd_offset as usize] = CMD_JUMP; ptcl[self.cmd_offset as usize + 1] = new_cmd; From 1928122d77b997ff90a96f416c0e5f390a0787f0 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:06:37 +0100 Subject: [PATCH 10/18] Move subset to a helper function --- examples/with_winit/src/stats.rs | 2 +- vello/src/lib.rs | 189 ++++++++++++++------------ vello/src/render.rs | 2 +- vello_encoding/src/config.rs | 10 +- vello_shaders/shader/shared/bump.wgsl | 2 +- vello_shaders/src/cpu/coarse.rs | 4 +- vello_shaders/src/cpu/prepare.rs | 2 +- 7 files changed, 113 insertions(+), 98 deletions(-) diff --git a/examples/with_winit/src/stats.rs b/examples/with_winit/src/stats.rs index 3b5339dee..0524322f3 100644 --- a/examples/with_winit/src/stats.rs +++ b/examples/with_winit/src/stats.rs @@ -75,7 +75,7 @@ impl Snapshot { labels.push(format!("ptcl: {}", bump.ptcl)); labels.push(format!("tile: {}", bump.tile)); labels.push(format!("segments: {}", bump.segments)); - labels.push(format!("blend: {}", bump.blend)); + labels.push(format!("blend: {}", bump.blend_spill)); } // height / 2 is dedicated to the text labels and the rest is filled by the bar graph. diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 3215bfd74..a942660bf 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -89,12 +89,14 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; -use std::sync::{ - atomic::{AtomicBool, Ordering}, - Arc, -}; #[cfg(feature = "wgpu")] -use std::{num::NonZeroUsize, sync::Arc}; +use std::{ + num::NonZeroUsize, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; /// Styling and composition primitives. 
pub use peniko; @@ -416,7 +418,7 @@ impl Renderer { let external_resources: &[ExternalResource] = if self.options.use_cpu { // HACK: Our handling of buffers across CPU and GPU is not great - // We don't retrain the bump buffer if we're using CPU shaders + // We don't retain the bump buffer if we're using CPU shaders // This is because some of stages might still be running on the // GPU, and we can't easily get the bump buffer back to the CPU // from the GPU @@ -468,6 +470,97 @@ impl Renderer { surface: &SurfaceTexture, params: &RenderParams, ) -> Result<()> { + let buffer_completed = self.block_on_bump_and_reallocate(device); + + let width = params.width; + let height = params.height; + let mut target = self + .target + .take() + .unwrap_or_else(|| TargetTexture::new(device, width, height)); + // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator + // during resize, specifically on metal. + if target.width != width || target.height != height { + target = TargetTexture::new(device, width, height); + } + let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; + let blit = self + .blit + .as_ref() + .expect("renderer should have configured surface_format to use on a surface"); + let mut encoder = + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + let mut recording = Recording::default(); + let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); + let surface_proxy = ImageProxy::new( + width, + height, + ImageFormat::from_wgpu(surface.texture.format()), + ); + recording.draw(recording::DrawParams { + shader_id: blit.0, + instance_count: 1, + vertex_count: 6, + vertex_buffer: None, + resources: vec![ResourceProxy::Image(target_proxy)], + target: surface_proxy, + clear_color: Some([0., 0., 0., 0.]), + }); + let surface_view = surface + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let external_resources = [ + ExternalResource::Image(target_proxy, &target.view), + ExternalResource::Image(surface_proxy, &surface_view), + ]; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "blit (render_to_surface)", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; + #[cfg(feature = "wgpu-profiler")] + self.profiler.resolve_queries(&mut encoder); + if let Some(download) = &bump_download { + let completed = buffer_completed.clone(); + download + .slice(..) + .map_async(wgpu::MapMode::Read, move |res| match res { + Ok(()) => { + completed.store(true, Ordering::Release); + } + Err(e) => { + log::warn!("Failed to map bump buffer: {e}"); + } + }); + }; + let idx = queue.submit(Some(encoder.finish())); + if let Some(download) = bump_download { + self.previouser_submission = + self.previous_submission + .replace((idx, download, buffer_completed)); + } else { + self.previouser_submission = self.previous_submission.take(); + } + self.target = Some(target); + #[cfg(feature = "wgpu-profiler")] + { + self.profiler.end_frame().unwrap(); + if let Some(result) = self + .profiler + .process_finished_frame(queue.get_timestamp_period()) + { + self.profile_result = Some(result); + } + } + Ok(()) + } + + /// Wait for the frame "two frames ago"'s bump buffer to be available, and reallocate if so. 
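+    ///
+    /// Two bump readbacks are kept in flight: `previous_submission` holds the
+    /// buffer submitted last frame and `previouser_submission` the one before
+    /// that. This waits for the older readback's `map_async` to complete, reads
+    /// its `BumpAllocators` counters, and grows any bump size that was exceeded
+    /// so that the next frame's config uses the larger allocation.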
+ fn block_on_bump_and_reallocate(&mut self, device: &Device) -> Arc { let buffer_completed = if let Some((idx, bump, completed)) = self.previouser_submission.take() { @@ -587,89 +680,7 @@ impl Renderer { } else { Arc::new(AtomicBool::new(false)) }; - - let width = params.width; - let height = params.height; - let mut target = self - .target - .take() - .unwrap_or_else(|| TargetTexture::new(device, width, height)); - // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator - // during resize, specifically on metal. - if target.width != width || target.height != height { - target = TargetTexture::new(device, width, height); - } - let bump_download = self.render_to_texture(device, queue, scene, &target.view, params)?; - let blit = self - .blit - .as_ref() - .expect("renderer should have configured surface_format to use on a surface"); - let mut recording = Recording::default(); - let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); - let surface_proxy = ImageProxy::new( - width, - height, - ImageFormat::from_wgpu(surface.texture.format()), - ); - recording.draw(recording::DrawParams { - shader_id: blit.0, - instance_count: 1, - vertex_count: 6, - vertex_buffer: None, - resources: vec![ResourceProxy::Image(target_proxy)], - target: surface_proxy, - clear_color: Some([0., 0., 0., 0.]), - }); - - let surface_view = surface - .texture - .create_view(&wgpu::TextureViewDescriptor::default()); - let external_resources = [ - ExternalResource::Image(target_proxy, &target.view), - ExternalResource::Image(surface_proxy, &surface_view), - ]; - self.engine.run_recording( - device, - queue, - &recording, - &external_resources, - "blit (render_to_surface)", - #[cfg(feature = "wgpu-profiler")] - &mut self.profiler, - ); - if let Some(download) = &bump_download { - let completed = buffer_completed.clone(); - download - .slice(..) - .map_async(wgpu::MapMode::Read, move |res| match res { - Ok(()) => { - completed.store(true, Ordering::Release); - } - Err(e) => { - log::warn!("Failed to map bump buffer: {e}"); - } - }); - }; - let idx = queue.submit(Some(encoder.finish())); - if let Some(download) = bump_download { - self.previouser_submission = - self.previous_submission - .replace((idx, download, buffer_completed)); - } else { - self.previouser_submission = self.previous_submission.take(); - } - self.target = Some(target); - #[cfg(feature = "wgpu-profiler")] - { - self.profiler.end_frame().unwrap(); - if let Some(result) = self - .profiler - .process_finished_frame(queue.get_timestamp_period()) - { - self.profile_result = Some(result); - } - } - Ok(()) + buffer_completed } /// Reload the shaders. 
This should only be used during `vello` development diff --git a/vello/src/render.rs b/vello/src/render.rs index c45ba6296..1be4c18bd 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -480,7 +480,7 @@ impl Render { recording.free_resource(path_buf); let out_image = ImageProxy::new(params.width, params.height, ImageFormat::Rgba8); let blend_spill_buf = BufferProxy::new( - buffer_sizes.blend_spill.size_in_bytes().into(), + buffer_sizes.bump_buffers.blend_spill.size_in_bytes().into(), "blend_spill", ); self.fine_wg_count = Some(wg_counts.fine); diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 6aeca0d46..7226090f4 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -37,7 +37,7 @@ pub struct BumpAllocators { pub tile: u32, pub seg_counts: u32, pub segments: u32, - pub blend: u32, + pub blend_spill: u32, pub lines: u32, } @@ -48,6 +48,7 @@ pub struct BumpAllocatorMemory { pub tile: BufferSize, pub seg_counts: BufferSize, pub segments: BufferSize, + pub blend_spill: BufferSize, pub lines: BufferSize, } @@ -59,12 +60,14 @@ impl BumpAllocators { let seg_counts = BufferSize::new(self.seg_counts); let segments = BufferSize::new(self.segments); let lines = BufferSize::new(self.lines); + let blend_spill = BufferSize::new(self.blend_spill); BumpAllocatorMemory { binning, ptcl, tile, seg_counts, segments, + blend_spill, lines, } } @@ -79,6 +82,7 @@ impl BumpAllocatorMemory { self.tile.size_in_bytes(), self.seg_counts.size_in_bytes(), self.segments.size_in_bytes(), + self.blend_spill.size_in_bytes(), self.lines.size_in_bytes(), ] .into_iter() @@ -215,7 +219,7 @@ impl RenderConfig { tiles_size: bump_buffers.tile, seg_counts_size: bump_buffers.seg_counts, segments_size: bump_buffers.segments, - blend_size: buffer_sizes.bump_buffers.blend_spill, + blend_size: bump_buffers.blend_spill, ptcl_size: bump_buffers.ptcl, layout, }, @@ -402,7 +406,6 @@ impl BumpAllocators { segments, tile, blend_spill, - blend: 0, failed: 0, } } @@ -463,6 +466,7 @@ impl BufferSizes { indirect_count, bin_headers, paths, + bump_buffers, } } } diff --git a/vello_shaders/shader/shared/bump.wgsl b/vello_shaders/shader/shared/bump.wgsl index 54763d94f..1d732c04d 100644 --- a/vello_shaders/shader/shared/bump.wgsl +++ b/vello_shaders/shader/shared/bump.wgsl @@ -18,7 +18,7 @@ struct BumpAllocators { tile: atomic, seg_counts: atomic, segments: atomic, - blend: atomic, + blend_spill: atomic, lines: atomic, } diff --git a/vello_shaders/src/cpu/coarse.rs b/vello_shaders/src/cpu/coarse.rs index c28907ea3..e345ef138 100644 --- a/vello_shaders/src/cpu/coarse.rs +++ b/vello_shaders/src/cpu/coarse.rs @@ -354,8 +354,8 @@ fn coarse_main( ptcl[tile_state.cmd_offset as usize] = CMD_END; let scratch_size = (max_blend_depth.saturating_sub(BLEND_STACK_SPLIT)) * TILE_WIDTH * TILE_HEIGHT; - ptcl[blend_offset as usize] = bump.blend; - bump.blend += scratch_size; + ptcl[blend_offset as usize] = bump.blend_spill; + bump.blend_spill += scratch_size; } } } diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs index 81750b0ac..26feaf79a 100644 --- a/vello_shaders/src/cpu/prepare.rs +++ b/vello_shaders/src/cpu/prepare.rs @@ -9,7 +9,7 @@ fn prepare_main(_config: &ConfigUniform, bump: &mut BumpAllocators) { bump.tile = 0; bump.seg_counts = 0; bump.segments = 0; - bump.blend = 0; + bump.blend_spill = 0; bump.lines = 0; bump.failed = 0; } From 947408ec6edd1faeb65a73fb57cb5251ef15f179 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> 
Date: Mon, 17 Jun 2024 10:09:32 +0100 Subject: [PATCH 11/18] Move segments overflow checking to path_tiling_setup This should slightly reduce the amount of work required --- vello/src/render.rs | 2 +- vello/src/shaders.rs | 2 +- vello_shaders/shader/coarse.wgsl | 6 +----- vello_shaders/shader/path_tiling_setup.wgsl | 23 +++++++++++++++------ vello_shaders/src/cpu/path_tiling_setup.rs | 16 +++++++++----- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index 1be4c18bd..293884c77 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -457,7 +457,7 @@ impl Render { recording.dispatch( shaders.path_tiling_setup, wg_counts.path_tiling_setup, - [bump_buf, indirect_count_buf.into(), ptcl_buf], + [config_buf, bump_buf, indirect_count_buf.into(), ptcl_buf], ); recording.dispatch_indirect( shaders.path_tiling, diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index 1124888ef..147d3cb14 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -196,7 +196,7 @@ pub(crate) fn full_shaders( Buffer, ] ); - let path_tiling_setup = add_shader!(path_tiling_setup, [Buffer, Buffer, Buffer]); + let path_tiling_setup = add_shader!(path_tiling_setup, [Uniform, Buffer, Buffer, Buffer]); let path_tiling = add_shader!( path_tiling, [ diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index a829405f9..cc4c80028 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -90,12 +90,8 @@ fn write_path(tile: Tile, tile_ix: u32, draw_flags: u32) { // fine). let n_segs = tile.segment_count_or_ix; if n_segs != 0u { + // We check for overflow of bump.segments in path_tiling_setup var seg_ix = atomicAdd(&bump.segments, n_segs); - if seg_ix > config.segments_size { - // All writes into segments happen in path_tiling, so we don't need - // to avoid UB in this shader - atomicOr(&bump.failed, STAGE_COARSE); - } tiles[tile_ix].segment_count_or_ix = ~seg_ix; alloc_cmd(4u); ptcl[cmd_offset] = CMD_FILL; diff --git a/vello_shaders/shader/path_tiling_setup.wgsl b/vello_shaders/shader/path_tiling_setup.wgsl index 4d5bf2e30..7843c9d84 100644 --- a/vello_shaders/shader/path_tiling_setup.wgsl +++ b/vello_shaders/shader/path_tiling_setup.wgsl @@ -3,15 +3,19 @@ // Set up dispatch size for path tiling stage. 
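+//
+// In the success path the indirect dispatch is ceil(seg_counts / WG_SIZE); for
+// example, 1000 recorded segment counts with WG_SIZE = 256u dispatch 4
+// workgroups. On overflow the dispatch is zeroed instead and fine is signalled
+// through ptcl[0].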
+#import config #import bump @group(0) @binding(0) -var<storage, read_write> bump: BumpAllocators; +var<uniform> config: Config; @group(0) @binding(1) -var<storage, read_write> indirect: IndirectCount; +var<storage, read_write> bump: BumpAllocators; @group(0) @binding(2) +var<storage, read_write> indirect: IndirectCount; + +@group(0) @binding(3) var<storage, read_write> ptcl: array<u32>; // Partition size for path tiling stage let WG_SIZE = 256u; @compute @workgroup_size(1) fn main() { - if atomicLoad(&bump.failed) != 0u { + indirect.count_y = 1u; + indirect.count_z = 1u; + let segments = atomicLoad(&bump.seg_counts); + let overflowed = segments > config.segments_size; + // If segments == config.segments_size, that's not an overflow + if atomicLoad(&bump.failed) != 0u || overflowed { + if overflowed { + // Report the failure so that the CPU and `prepare` know that we've failed + atomicOr(&bump.failed, STAGE_COARSE); + } + // Cancel path_tiling indirect.count_x = 0u; // signal fine rasterizer that failure happened (it doesn't bind bump) ptcl[0] = ~0u; } else { - let segments = atomicLoad(&bump.seg_counts); indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE; } - indirect.count_y = 1u; - indirect.count_z = 1u; } diff --git a/vello_shaders/src/cpu/path_tiling_setup.rs b/vello_shaders/src/cpu/path_tiling_setup.rs index 9b6303691..e087feba2 100644 --- a/vello_shaders/src/cpu/path_tiling_setup.rs +++ b/vello_shaders/src/cpu/path_tiling_setup.rs @@ -1,13 +1,18 @@ // Copyright 2023 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense -use vello_encoding::{BumpAllocators, IndirectCount}; +use vello_encoding::{BumpAllocators, ConfigUniform, IndirectCount}; use super::CpuBinding; const WG_SIZE: usize = 256; -fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { +fn path_tiling_setup_main( + // Config is needed to detect allocation failure + _config: &ConfigUniform, + bump: &BumpAllocators, + indirect: &mut IndirectCount, +) { let segments = bump.seg_counts; indirect.count_x = (segments + (WG_SIZE as u32 - 1)) / WG_SIZE as u32; indirect.count_y = 1; @@ -15,8 +20,9 @@ fn path_tiling_setup_main(bump: &BumpAllocators, indirect: &mut IndirectCount) { } pub fn path_tiling_setup(_n_wg: u32, resources: &[CpuBinding]) { - let bump = resources[0].as_typed(); - let mut indirect = resources[1].as_typed_mut(); + let config = resources[0].as_typed(); + let bump = resources[1].as_typed(); + let mut indirect = resources[2].as_typed_mut(); // binding 2 is ptcl, which we would need if we propagate failure - path_tiling_setup_main(&bump, &mut indirect); + path_tiling_setup_main(&config, &bump, &mut indirect); } From cf62daeade62489811c7eb84dc2ac1509b734778 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:01:30 +0100 Subject: [PATCH 12/18] Fix compilation of bump estimator --- vello/src/scene.rs | 4 ++-- vello_encoding/src/estimate.rs | 7 +++---- vello_shaders/src/cpu/prepare.rs | 3 +++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vello/src/scene.rs b/vello/src/scene.rs index 103e9b222..594ec3ff5 100644 --- a/vello/src/scene.rs +++ b/vello/src/scene.rs @@ -14,7 +14,7 @@ use skrifa::{ GlyphId, MetadataProvider, OutlineGlyphCollection, }; #[cfg(feature = "bump_estimate")] -use vello_encoding::BumpAllocatorMemory; +use vello_encoding::BumpAllocators; use vello_encoding::{Encoding, Glyph, GlyphRun, Patch, Transform}; // TODO - Document invariants and edge cases (#470) @@ -49,7 +49,7 @@ impl Scene { /// Tally up the bump allocator estimate for the current state of the encoding, ///
taking into account an optional `transform` applied to the entire scene. #[cfg(feature = "bump_estimate")] - pub fn bump_estimate(&self, transform: Option) -> BumpAllocatorMemory { + pub fn bump_estimate(&self, transform: Option) -> BumpAllocators { self.estimator .tally(transform.as_ref().map(Transform::from_kurbo).as_ref()) } diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index 62b943013..6d1e87008 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -162,7 +162,7 @@ impl BumpEstimator { } /// Produce the final total, applying an optional transform to all content. - pub fn tally(&self, transform: Option<&Transform>) -> BumpAllocatorMemory { + pub fn tally(&self, transform: Option<&Transform>) -> BumpAllocators { let scale = transform_scale(transform); // The post-flatten line estimate. @@ -172,7 +172,7 @@ impl BumpEstimator { // segments as there are lines, in case `segments` was underestimated at small scales. let n_segments = ((self.segments as f64 * scale).ceil() as u32).max(lines); - let bump = BumpAllocators { + BumpAllocators { failed: 0, // TODO: we can provide a tighter bound here but for now we // assume that binning must be bounded by the segment count. @@ -183,8 +183,7 @@ impl BumpEstimator { seg_counts: n_segments, segments: n_segments, lines, - }; - bump.memory() + } } fn count_stroke_caps(&mut self, style: Cap, scaled_width: f64, count: u32) { diff --git a/vello_shaders/src/cpu/prepare.rs b/vello_shaders/src/cpu/prepare.rs index 26feaf79a..306f05f9e 100644 --- a/vello_shaders/src/cpu/prepare.rs +++ b/vello_shaders/src/cpu/prepare.rs @@ -1,3 +1,6 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + use vello_encoding::{BumpAllocators, ConfigUniform}; use super::CpuBinding; From f504d1295cc933a23f5ae359d25b6c51fe5c74cf Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:11:35 +0100 Subject: [PATCH 13/18] Fix incorrect wgpu mixing --- vello/src/lib.rs | 7 +++++-- vello_encoding/src/estimate.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index a942660bf..eeed381d3 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -128,7 +128,6 @@ pub use shaders::FullShaders; #[cfg(feature = "wgpu")] use vello_encoding::Resolver; -use wgpu::SubmissionIndex; #[cfg(feature = "wgpu")] use wgpu_engine::{ExternalResource, WgpuEngine}; @@ -136,7 +135,10 @@ pub use debug::DebugLayers; /// Temporary export, used in `with_winit` for stats pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] -use wgpu::{Buffer, BufferUsages, Device, Queue, SurfaceTexture, TextureFormat, TextureView}; +use wgpu::{ + Buffer, BufferUsages, Device, PipelineCompilationOptions, Queue, SubmissionIndex, + SurfaceTexture, TextureFormat, TextureView, +}; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -247,6 +249,7 @@ pub enum Error { #[allow(dead_code)] // this can be unused when wgpu feature is not used pub(crate) type Result = std::result::Result; +#[cfg(feature = "wgpu")] type BumpSubmission = (SubmissionIndex, Buffer, Arc); /// Renders a scene into a texture or surface. diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index 6d1e87008..c36497d11 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -4,7 +4,7 @@ //! 
This utility provides conservative size estimation for buffer allocations backing //! GPU bump memory. This estimate relies on heuristics and naturally overestimates. -use super::{BumpAllocatorMemory, BumpAllocators, Transform}; +use super::{BumpAllocators, Transform}; use peniko::kurbo::{Cap, Join, PathEl, Point, Stroke, Vec2}; const RSQRT_OF_TOL: f64 = 2.2360679775; // tol = 0.2 From 692c7f82cc22b18772660af3d355fbd2ab868122 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:14:54 +0100 Subject: [PATCH 14/18] Use the initial sizes to let tests work --- vello/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index eeed381d3..9b56d06a4 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -370,7 +370,7 @@ impl Renderer { bump: None, previous_submission: None, previouser_submission: None, - bump_sizes: BumpAllocators::default(), + bump_sizes: BumpAllocators::initial_sizes(), // Use 3 pending frames #[cfg(feature = "wgpu-profiler")] profiler: GpuProfiler::new(GpuProfilerSettings { From 7d9c609fc1f33af0c28565d69907d9a29c7a20ec Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:34:23 +0100 Subject: [PATCH 15/18] Save some rebase cleanup --- vello/src/lib.rs | 8 +++++--- vello/src/render.rs | 7 ++++--- vello_shaders/shader/coarse.wgsl | 2 +- vello_shaders/shader/prepare.wgsl | 6 ++++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 9b56d06a4..bb39c4240 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -136,8 +136,8 @@ pub use debug::DebugLayers; pub use vello_encoding::BumpAllocators; #[cfg(feature = "wgpu")] use wgpu::{ - Buffer, BufferUsages, Device, PipelineCompilationOptions, Queue, SubmissionIndex, - SurfaceTexture, TextureFormat, TextureView, + Buffer, BufferUsages, Device, Queue, SubmissionIndex, SurfaceTexture, TextureFormat, + TextureView, }; #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))] use wgpu_profiler::{GpuProfiler, GpuProfilerSettings}; @@ -491,7 +491,7 @@ impl Renderer { .blit .as_ref() .expect("renderer should have configured surface_format to use on a surface"); - let mut encoder = + let encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); let mut recording = Recording::default(); let target_proxy = ImageProxy::new(width, height, ImageFormat::from_wgpu(target.format)); @@ -526,6 +526,8 @@ impl Renderer { &mut self.profiler, )?; #[cfg(feature = "wgpu-profiler")] + let mut encoder = encoder; + #[cfg(feature = "wgpu-profiler")] self.profiler.resolve_queries(&mut encoder); if let Some(download) = &bump_download { let completed = buffer_completed.clone(); diff --git a/vello/src/render.rs b/vello/src/render.rs index 293884c77..a79951be2 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -103,7 +103,7 @@ pub(crate) fn render_encoding_full( ) -> (Recording, ImageProxy, BufferProxy) { let mut render = Render::new(); let mut recording = - render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, true); + render.render_encoding_coarse(encoding, resolver, shaders, params, bump_sizes, false); let out_image = render.out_image(); let bump_buf = render.bump_buf(); render.record_fine(shaders, &mut recording); @@ -497,8 +497,8 @@ impl Render { image_atlas: ResourceProxy::Image(image_atlas), out_image, }); - // TODO: This second check is a massive hack - if robust && 
!shaders.pathtag_is_cpu { + // TODO: This check is a massive hack to disable robustness if + if !shaders.pathtag_is_cpu { recording.download(*bump_buf.as_buf().unwrap()); } recording.free_resource(bump_buf); @@ -595,6 +595,7 @@ impl Render { recording.free_resource(fine.gradient_image); recording.free_resource(fine.image_atlas); recording.free_resource(fine.info_bin_data_buf); + recording.free_resource(fine.blend_spill_buf); // TODO: make mask buf persistent if let Some(mask_buf) = self.mask_buf.take() { recording.free_resource(mask_buf); diff --git a/vello_shaders/shader/coarse.wgsl b/vello_shaders/shader/coarse.wgsl index cc4c80028..8048e07b1 100644 --- a/vello_shaders/shader/coarse.wgsl +++ b/vello_shaders/shader/coarse.wgsl @@ -444,7 +444,7 @@ fn main( var blend_ix = 0u; if max_blend_depth > BLEND_STACK_SPLIT { let scratch_size = (max_blend_depth - BLEND_STACK_SPLIT) * TILE_WIDTH * TILE_HEIGHT; - blend_ix = atomicAdd(&bump.blend, scratch_size); + blend_ix = atomicAdd(&bump.blend_spill, scratch_size); if blend_ix + scratch_size > config.blend_size { atomicOr(&bump.failed, STAGE_COARSE); } diff --git a/vello_shaders/shader/prepare.wgsl b/vello_shaders/shader/prepare.wgsl index 01006afd6..530d43362 100644 --- a/vello_shaders/shader/prepare.wgsl +++ b/vello_shaders/shader/prepare.wgsl @@ -50,7 +50,9 @@ fn main() { if config.lines_size < atomicLoad(&bump.lines) { should_cancel = true; } - // config.blend_size < atomicLoad(&bump.blend) + if config.blend_size < atomicLoad(&bump.blend_spill) { + should_cancel = true; + } if should_cancel { // Then don't run this frame config.cancelled = 1u; @@ -64,6 +66,6 @@ fn main() { atomicStore(&bump.tile, 0u); atomicStore(&bump.seg_counts, 0u); atomicStore(&bump.segments, 0u); - atomicStore(&bump.blend, 0u); + atomicStore(&bump.blend_spill, 0u); atomicStore(&bump.lines, 0u); } From d2e97822df20a7a83a882fca6d67af2b221318be Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:36:25 +0100 Subject: [PATCH 16/18] Properly reallocate blend spill --- vello/src/lib.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index bb39c4240..797139831 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -668,6 +668,17 @@ impl Renderer { ); self.bump_sizes.segments = new_size; } + if data.blend_spill > self.bump_sizes.blend_spill { + changed = true; + let new_size = data.blend_spill * 5 / 4; + log::debug!( + "Resizing blend_spill to {:?} (Needed {:?}, had {:?})", + new_size, + data.blend_spill, + self.bump_sizes.blend_spill, + ); + self.bump_sizes.blend_spill = new_size; + } if !changed { log::warn!("Detected need for reallocation, but didn't reallocate {:x?}. Data {data:?}", data.failed); } else { From d34f3db24ef88c21adc5f00dc3672459f688d9fb Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:07:29 +0100 Subject: [PATCH 17/18] If allocation fails, don't try and download a fake lines buffer --- vello/src/lib.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vello/src/lib.rs b/vello/src/lib.rs index 797139831..2fbcee784 100644 --- a/vello/src/lib.rs +++ b/vello/src/lib.rs @@ -894,17 +894,18 @@ impl Renderer { .as_ref() .expect("renderer should have configured surface_format to use on a surface"); let bump = result.bump.as_ref().unwrap(); - // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset. 
- let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?; - debug.render( - &mut recording, - surface_proxy, - &captured, - bump, - params, - &downloads, - ); - + if bump.failed == 0 { + // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset. + let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?; + debug.render( + &mut recording, + surface_proxy, + &captured, + bump, + params, + &downloads, + ); + } // TODO: this sucks. better to release everything in a helper // TODO: it would be much better to have a way to safely destroy a buffer. self.engine.free_download(captured.lines); From 83a5c286d412dc5476f8f10522abade57c9ba8c4 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:11:47 +0100 Subject: [PATCH 18/18] Fix clippy issues with features --- vello/src/render.rs | 2 +- vello_encoding/src/estimate.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index a79951be2..292a891d4 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -138,7 +138,7 @@ impl Render { shaders: &FullShaders, params: &RenderParams, bump_sizes: BumpAllocators, - robust: bool, + #[cfg_attr(not(feature = "debug_layers"), allow(unused))] robust: bool, ) -> Recording { use vello_encoding::RenderConfig; let mut recording = Recording::default(); diff --git a/vello_encoding/src/estimate.rs b/vello_encoding/src/estimate.rs index c36497d11..7636401ab 100644 --- a/vello_encoding/src/estimate.rs +++ b/vello_encoding/src/estimate.rs @@ -179,7 +179,7 @@ impl BumpEstimator { binning: n_segments, ptcl: 0, tile: 0, - blend: 0, + blend_spill: 0, seg_counts: n_segments, segments: n_segments, lines,