diff --git a/src/lib.rs b/src/lib.rs
index 006accd3c..c68997cc3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -116,11 +116,8 @@ pub struct RendererOptions {
 impl Renderer {
     /// Creates a new renderer for the specified device.
     pub fn new(device: &Device, render_options: &RendererOptions) -> Result<Self> {
-        let mut engine = WgpuEngine::new();
-        let mut shaders = shaders::full_shaders(device, &mut engine)?;
-        if render_options.use_cpu {
-            shaders.install_cpu_shaders(&mut engine);
-        }
+        let mut engine = WgpuEngine::new(render_options.use_cpu);
+        let shaders = shaders::full_shaders(device, &mut engine)?;
         let blit = render_options
             .surface_format
             .map(|surface_format| BlitPipeline::new(device, surface_format));
@@ -240,11 +237,8 @@ impl Renderer {
     #[cfg(feature = "hot_reload")]
     pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> {
         device.push_error_scope(wgpu::ErrorFilter::Validation);
-        let mut engine = WgpuEngine::new();
-        let mut shaders = shaders::full_shaders(device, &mut engine)?;
-        if self.use_cpu {
-            shaders.install_cpu_shaders(&mut engine);
-        }
+        let mut engine = WgpuEngine::new(self.use_cpu);
+        let shaders = shaders::full_shaders(device, &mut engine)?;
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
diff --git a/src/shaders.rs b/src/shaders.rs
index 668dafac4..56febebe3 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -86,6 +86,7 @@ pub struct FullShaders {
 
 #[cfg(feature = "wgpu")]
 pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShaders> {
+    use crate::wgpu_engine::CpuShaderType;
    use crate::ANTIALIASING;
 
     let imports = SHARED_SHADERS
[... further hunks in src/shaders.rs (@@ -114,12 +115,14 @@ through @@ -335,6 +356,7 @@): each engine.add_shader call in full_shaders gains a trailing CpuShaderType argument ...]
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index c5359c1bb..dfd20aa87 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -25,13 +25,45 @@ pub struct WgpuEngine {
     pool: ResourcePool,
     bind_map: BindMap,
     downloads: HashMap<Id, Buffer>,
+    pub(crate) use_cpu: bool,
 }
 
-struct Shader {
+struct WgpuShader {
     pipeline: ComputePipeline,
     bind_group_layout: BindGroupLayout,
+}
+
+pub enum CpuShaderType {
+    Present(fn(u32, &[CpuBinding])),
+    Missing,
+    Unused,
+}
+
+struct CpuShader {
+    shader: fn(u32, &[CpuBinding]),
+}
+
+enum ShaderKind<'a> {
+    Wgpu(&'a WgpuShader),
+    Cpu(&'a CpuShader),
+}
+
+struct Shader {
     label: &'static str,
-    cpu_shader: Option<fn(u32, &[CpuBinding])>,
+    wgpu: Option<WgpuShader>,
+    cpu: Option<CpuShader>,
+}
+
+impl Shader {
+    fn select(&self) -> ShaderKind {
+        if let Some(cpu) = self.cpu.as_ref() {
+            ShaderKind::Cpu(cpu)
+        } else if let Some(wgpu) = self.wgpu.as_ref() {
+            ShaderKind::Wgpu(wgpu)
+        } else {
+            panic!("no available shader")
+        }
+    }
 }
 
 pub enum ExternalResource<'a> {
@@ -90,8 +122,11 @@ enum TransientBuf<'a> {
 }
 
 impl WgpuEngine {
-    pub fn new() -> WgpuEngine {
-        Default::default()
+    pub fn new(use_cpu: bool) -> WgpuEngine {
+        Self {
+            use_cpu,
+            ..Default::default()
+        }
     }
 
     /// Add a shader.
@@ -107,7 +142,36 @@ impl WgpuEngine {
         label: &'static str,
         wgsl: Cow<'static, str>,
         layout: &[BindType],
+        cpu_shader: CpuShaderType,
     ) -> Result<ShaderId> {
+        let mut add = |shader| {
+            let id = self.shaders.len();
+            self.shaders.push(shader);
+            Ok(ShaderId(id))
+        };
+
+        if self.use_cpu {
+            match cpu_shader {
+                CpuShaderType::Present(shader) => {
+                    return add(Shader {
+                        wgpu: None,
+                        cpu: Some(CpuShader { shader }),
+                        label,
+                    });
+                }
+                // This shader is unused in CPU mode, create a dummy shader
+                CpuShaderType::Unused => {
+                    return add(Shader {
+                        wgpu: None,
+                        cpu: None,
+                        label,
+                    });
+                }
+                // Create a GPU shader as we don't have a CPU shader
+                CpuShaderType::Missing => {}
+            }
+        }
+
         let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
             label: Some(label),
             source: wgpu::ShaderSource::Wgsl(wgsl),
@@ -176,20 +240,14 @@ impl WgpuEngine {
             module: &shader_module,
             entry_point: "main",
         });
-        let cpu_shader = None;
-        let shader = Shader {
-            pipeline,
-            bind_group_layout,
+        add(Shader {
+            wgpu: Some(WgpuShader {
+                pipeline,
+                bind_group_layout,
+            }),
+            cpu: None,
             label,
-            cpu_shader,
-        };
-        let id = self.shaders.len();
-        self.shaders.push(shader);
-        Ok(ShaderId(id))
-    }
-
-    pub fn set_cpu_shader(&mut self, id: ShaderId, f: fn(u32, &[CpuBinding])) {
-        self.shaders[id.0].cpu_shader = Some(f);
+        })
     }
 
     pub fn run_recording(
@@ -318,82 +376,88 @@ impl WgpuEngine {
                 Command::Dispatch(shader_id, wg_size, bindings) => {
                     // println!("dispatching {:?} with {} bindings", wg_size, bindings.len());
                     let shader = &self.shaders[shader_id.0];
-                    if let Some(cpu_shader) = shader.cpu_shader {
-                        // The current strategy is to run the CPU shader synchronously. This
-                        // works because there is currently the added constraint that data
-                        // can only flow from CPU to GPU, not the other way around. If and
-                        // when we implement that, we will need to defer the execution. Of
-                        // course, we will also need to wire up more async synchronization
-                        // mechanisms, as the CPU dispatch can't run until the preceding
-                        // command buffer submission completes (and, in WebGPU, the async
-                        // mapping operations on the buffers complete).
-                        let resources =
-                            transient_map.create_cpu_resources(&mut self.bind_map, bindings);
-                        cpu_shader(wg_size.0, &resources);
-                    } else {
-                        let bind_group = transient_map.create_bind_group(
-                            &mut self.bind_map,
-                            &mut self.pool,
-                            device,
-                            queue,
-                            &mut encoder,
-                            &shader.bind_group_layout,
-                            bindings,
-                        )?;
-                        let mut cpass = encoder.begin_compute_pass(&Default::default());
-                        #[cfg(feature = "wgpu-profiler")]
-                        profiler.begin_scope(shader.label, &mut cpass, device);
-                        cpass.set_pipeline(&shader.pipeline);
-                        cpass.set_bind_group(0, &bind_group, &[]);
-                        cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2);
-                        #[cfg(feature = "wgpu-profiler")]
-                        profiler.end_scope(&mut cpass);
+                    match shader.select() {
+                        ShaderKind::Cpu(cpu_shader) => {
+                            // The current strategy is to run the CPU shader synchronously. This
+                            // works because there is currently the added constraint that data
+                            // can only flow from CPU to GPU, not the other way around. If and
+                            // when we implement that, we will need to defer the execution. Of
+                            // course, we will also need to wire up more async synchronization
+                            // mechanisms, as the CPU dispatch can't run until the preceding
+                            // command buffer submission completes (and, in WebGPU, the async
+                            // mapping operations on the buffers complete).
+                            let resources =
+                                transient_map.create_cpu_resources(&mut self.bind_map, bindings);
+                            (cpu_shader.shader)(wg_size.0, &resources);
+                        }
+                        ShaderKind::Wgpu(wgpu_shader) => {
+                            let bind_group = transient_map.create_bind_group(
+                                &mut self.bind_map,
+                                &mut self.pool,
+                                device,
+                                queue,
+                                &mut encoder,
+                                &wgpu_shader.bind_group_layout,
+                                bindings,
+                            )?;
+                            let mut cpass = encoder.begin_compute_pass(&Default::default());
+                            #[cfg(feature = "wgpu-profiler")]
+                            profiler.begin_scope(shader.label, &mut cpass, device);
+                            cpass.set_pipeline(&wgpu_shader.pipeline);
+                            cpass.set_bind_group(0, &bind_group, &[]);
+                            cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2);
+                            #[cfg(feature = "wgpu-profiler")]
+                            profiler.end_scope(&mut cpass);
+                        }
                     }
                 }
                 Command::DispatchIndirect(shader_id, proxy, offset, bindings) => {
                     let shader = &self.shaders[shader_id.0];
-                    if let Some(cpu_shader) = shader.cpu_shader {
-                        // Same consideration as above about running the CPU shader synchronously.
-                        let n_wg;
-                        if let CpuBinding::BufferRW(b) = self.bind_map.get_cpu_buf(proxy.id) {
-                            let slice = b.borrow();
-                            let indirect: &[u32] = bytemuck::cast_slice(&slice);
-                            n_wg = indirect[0];
-                        } else {
-                            panic!("indirect buffer missing from bind map");
+                    match shader.select() {
+                        ShaderKind::Cpu(cpu_shader) => {
+                            // Same consideration as above about running the CPU shader synchronously.
+                            let n_wg;
+                            if let CpuBinding::BufferRW(b) = self.bind_map.get_cpu_buf(proxy.id) {
+                                let slice = b.borrow();
+                                let indirect: &[u32] = bytemuck::cast_slice(&slice);
+                                n_wg = indirect[0];
+                            } else {
+                                panic!("indirect buffer missing from bind map");
+                            }
+                            let resources =
+                                transient_map.create_cpu_resources(&mut self.bind_map, bindings);
+                            (cpu_shader.shader)(n_wg, &resources);
+                        }
+                        ShaderKind::Wgpu(wgpu_shader) => {
+                            let bind_group = transient_map.create_bind_group(
+                                &mut self.bind_map,
+                                &mut self.pool,
+                                device,
+                                queue,
+                                &mut encoder,
+                                &wgpu_shader.bind_group_layout,
+                                bindings,
+                            )?;
+                            transient_map.materialize_gpu_buf_for_indirect(
+                                &mut self.bind_map,
+                                &mut self.pool,
+                                device,
+                                queue,
+                                proxy,
+                            );
+                            let mut cpass = encoder.begin_compute_pass(&Default::default());
+                            #[cfg(feature = "wgpu-profiler")]
+                            profiler.begin_scope(shader.label, &mut cpass, device);
+                            cpass.set_pipeline(&wgpu_shader.pipeline);
+                            cpass.set_bind_group(0, &bind_group, &[]);
+                            let buf = self
+                                .bind_map
+                                .get_gpu_buf(proxy.id)
+                                .ok_or("buffer for indirect dispatch not in map")?;
+                            cpass.dispatch_workgroups_indirect(buf, *offset);
+                            #[cfg(feature = "wgpu-profiler")]
+                            profiler.end_scope(&mut cpass);
                         }
-                        let resources =
-                            transient_map.create_cpu_resources(&mut self.bind_map, bindings);
-                        cpu_shader(n_wg, &resources);
-                    } else {
-                        let bind_group = transient_map.create_bind_group(
-                            &mut self.bind_map,
-                            &mut self.pool,
-                            device,
-                            queue,
-                            &mut encoder,
-                            &shader.bind_group_layout,
-                            bindings,
-                        )?;
-                        transient_map.materialize_gpu_buf_for_indirect(
-                            &mut self.bind_map,
-                            &mut self.pool,
-                            device,
-                            queue,
-                            proxy,
-                        );
-                        let mut cpass = encoder.begin_compute_pass(&Default::default());
-                        #[cfg(feature = "wgpu-profiler")]
-                        profiler.begin_scope(shader.label, &mut cpass, device);
-                        cpass.set_pipeline(&shader.pipeline);
-                        cpass.set_bind_group(0, &bind_group, &[]);
-                        let buf = self
-                            .bind_map
-                            .get_gpu_buf(proxy.id)
-                            .ok_or("buffer for indirect dispatch not in map")?;
-                        cpass.dispatch_workgroups_indirect(buf, *offset);
-                        #[cfg(feature = "wgpu-profiler")]
-                        profiler.end_scope(&mut cpass);
                     }
                 }
                 Command::Download(proxy) => {
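
The patch above can be read as one pattern: the CPU/GPU decision moves from a post-hoc `install_cpu_shaders` pass into `add_shader` itself, keyed off the engine-wide `use_cpu` flag, and dispatch then consults `Shader::select()`. The following self-contained sketch mirrors that control flow outside of wgpu. `GpuPipeline` and `CpuBinding` are hypothetical stand-ins for `wgpu::ComputePipeline` and vello's `CpuBinding`; only the selection logic is taken from the patch.

```rust
// Standalone sketch of the CPU/GPU shader-selection scheme in the diff.
// `GpuPipeline` and `CpuBinding` are stand-ins, not the real vello types.

struct GpuPipeline; // stands in for a compiled wgpu::ComputePipeline
struct CpuBinding; // stands in for a CPU-side resource binding

enum CpuShaderType {
    /// A CPU implementation exists; it wins whenever `use_cpu` is set.
    Present(fn(u32, &[CpuBinding])),
    /// No CPU port yet: compile the GPU pipeline even in CPU mode.
    Missing,
    /// Never dispatched in CPU mode: register a dummy entry instead.
    Unused,
}

struct Shader {
    label: &'static str,
    gpu: Option<GpuPipeline>,
    cpu: Option<fn(u32, &[CpuBinding])>,
}

struct Engine {
    use_cpu: bool,
    shaders: Vec<Shader>,
}

impl Engine {
    /// Decide at registration time which backend a shader gets,
    /// mirroring the new `WgpuEngine::add_shader`.
    fn add_shader(&mut self, label: &'static str, cpu: CpuShaderType) -> usize {
        let shader = if self.use_cpu {
            match cpu {
                CpuShaderType::Present(f) => Shader { label, gpu: None, cpu: Some(f) },
                // Dummy entry keeps shader ids dense without compiling WGSL.
                CpuShaderType::Unused => Shader { label, gpu: None, cpu: None },
                // Hybrid case: no CPU shader, so build the GPU pipeline anyway.
                CpuShaderType::Missing => Shader { label, gpu: Some(GpuPipeline), cpu: None },
            }
        } else {
            Shader { label, gpu: Some(GpuPipeline), cpu: None }
        };
        self.shaders.push(shader);
        self.shaders.len() - 1
    }

    /// Mirrors `Shader::select`: prefer CPU, fall back to GPU, and panic
    /// only if a dummy (`Unused`) shader is actually dispatched.
    fn dispatch(&self, id: usize, n_wg: u32, bindings: &[CpuBinding]) {
        let shader = &self.shaders[id];
        if let Some(f) = shader.cpu {
            f(n_wg, bindings); // CPU shaders run synchronously, as in the patch
        } else if shader.gpu.is_some() {
            println!("would encode a GPU dispatch of {} ({n_wg} workgroups)", shader.label);
        } else {
            panic!("no available shader");
        }
    }
}

fn main() {
    let mut engine = Engine { use_cpu: true, shaders: Vec::new() };
    let reduce = engine.add_shader(
        "pathtag_reduce",
        CpuShaderType::Present(|n, _bindings| println!("cpu pathtag_reduce, {n} workgroups")),
    );
    let fine = engine.add_shader("fine", CpuShaderType::Missing);
    let _blit = engine.add_shader("blit", CpuShaderType::Unused); // placeholder id only
    engine.dispatch(reduce, 4, &[]);
    engine.dispatch(fine, 4, &[]); // hybrid: still a GPU dispatch in CPU mode
}
```

The `Missing`/`Unused` split is what makes the engine hybrid rather than all-or-nothing: a `Missing` stage keeps its GPU pipeline even when `use_cpu` is set (per the diff's "Create a GPU shader as we don't have a CPU shader" comment), while `Unused` skips WGSL compilation entirely but still occupies a slot so later `ShaderId` indices stay valid.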