Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare example to isolate shader miscompilation #199

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 8 additions & 79 deletions piet-wgsl/shader/tile_alloc.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -14,101 +14,30 @@
//
// Also licensed under MIT license, at your choice.

// Tile allocation (and zeroing of tiles)

#import config
#import bump
#import drawtag
#import tile

@group(0) @binding(0)
var<storage> config: Config;
var<storage, read_write> bump: atomic<u32>;

@group(0) @binding(1)
var<storage> scene: array<u32>;

@group(0) @binding(2)
var<storage> draw_bboxes: array<vec4<f32>>;

@group(0) @binding(3)
var<storage, read_write> bump: BumpAllocators;

@group(0) @binding(4)
var<storage, read_write> paths: array<Path>;

@group(0) @binding(5)
var<storage, read_write> tiles: array<Tile>;
var<storage, read_write> paths: array<u32>;

let WG_SIZE = 256u;

var<workgroup> sh_tile_count: array<u32, WG_SIZE>;
var<workgroup> sh_tile_offset: u32;

@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
) {
// scale factors useful for converting coordinates to tiles
// TODO: make into constants
let SX = 1.0 / f32(TILE_WIDTH);
let SY = 1.0 / f32(TILE_HEIGHT);

let drawobj_ix = global_id.x;
var drawtag = DRAWTAG_NOP;
if drawobj_ix < config.n_drawobj {
drawtag = scene[config.drawtag_base + drawobj_ix];
}
var x0 = 0;
var y0 = 0;
var x1 = 0;
var y1 = 0;
if drawtag != DRAWTAG_NOP && drawtag != DRAWTAG_END_CLIP {
let bbox = draw_bboxes[drawobj_ix];
x0 = i32(floor(bbox.x * SX));
y0 = i32(floor(bbox.y * SY));
x1 = i32(ceil(bbox.z * SX));
y1 = i32(ceil(bbox.w * SY));
}
let ux0 = u32(clamp(x0, 0, i32(config.width_in_tiles)));
let uy0 = u32(clamp(y0, 0, i32(config.height_in_tiles)));
let ux1 = u32(clamp(x1, 0, i32(config.width_in_tiles)));
let uy1 = u32(clamp(y1, 0, i32(config.height_in_tiles)));
let tile_count = (ux1 - ux0) * (uy1 - uy0);
var total_tile_count = tile_count;
sh_tile_count[local_id.x] = tile_count;
for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) {
workgroupBarrier();
if local_id.x >= (1u << i) {
total_tile_count += sh_tile_count[local_id.x - (1u << i)];
}
workgroupBarrier();
sh_tile_count[local_id.x] = total_tile_count;
}
let tile_count_in = local_id.x + 1u;
if local_id.x == WG_SIZE - 1u {
paths[drawobj_ix].tiles = atomicAdd(&bump.tile, sh_tile_count[WG_SIZE - 1u]);
sh_tile_offset = 1u + atomicAdd(&bump, tile_count_in);
}
// Using storage barriers is a workaround for what appears to be a miscompilation
// when a normal workgroup-shared variable is used to broadcast the value.
storageBarrier();
let tile_offset = paths[drawobj_ix | (WG_SIZE - 1u)].tiles;
storageBarrier();
if drawobj_ix < config.n_drawobj {
let tile_subix = select(0u, sh_tile_count[local_id.x - 1u], local_id.x > 0u);
let bbox = vec4<u32>(ux0, uy0, ux1, uy1);
let path = Path(bbox, tile_offset + tile_subix);
paths[drawobj_ix] = path;
workgroupBarrier();
let tile_offset = sh_tile_offset;
if drawobj_ix < 3u {
paths[drawobj_ix] = tile_offset;
}

// zero allocated memory
// Note: if the number of draw objects is small, utilization will be poor.
// There are two things that can be done to improve that. One would be a
// separate (indirect) dispatch. Another would be to have each workgroup
// process fewer draw objects than the number of threads in the wg.
let total_count = sh_tile_count[WG_SIZE - 1u];
for (var i = local_id.x; i < total_count; i += WG_SIZE) {
// Note: could format output buffer as u32 for even better load
// balancing, as does piet-gpu.
tiles[tile_offset + i] = Tile(0, 0u);
}
}
1 change: 1 addition & 0 deletions piet-wgsl/src/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ impl Engine {
label: None,
entries: &entries,
});
println!("{:?}", bind_group_layout);
let compute_pipeline_layout =
device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
Expand Down
21 changes: 4 additions & 17 deletions piet-wgsl/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,28 +69,15 @@ async fn do_render(
queue: &Queue,
engine: &mut Engine,
) -> Result<(), Box<dyn std::error::Error>> {
#[allow(unused)]
let shaders = shaders::init_shaders(device, engine)?;
let full_shaders = shaders::full_shaders(device, engine)?;
let scene = test_scene::gen_test_scene();
//test_scene::dump_scene_info(&scene);
//let (recording, buf) = render::render(&scene, &shaders);
let (recording, buf) = render::render_full(&scene, &full_shaders);
let tile_alloc = shaders::reduced_shader(device, engine);

let (recording, buf) = render::render_reduced(tile_alloc);
let downloads = engine.run_recording(&device, &queue, &recording)?;
let mapped = downloads.map();
device.poll(wgpu::Maintain::Wait);
let buf = mapped.get_mapped(buf).await?;

if false {
dump_buf(bytemuck::cast_slice(&buf));
} else {
let file = File::create("image.png")?;
let w = BufWriter::new(file);
let mut encoder = png::Encoder::new(w, 1024, 1024);
encoder.set_color(png::ColorType::Rgba);
let mut writer = encoder.write_header()?;
writer.write_image_data(&buf)?;
}
dump_buf(bytemuck::cast_slice(&buf));
Ok(())
}

Expand Down
Loading