From 0e27df45213e51d8e2e68f43dcbb5d5b32958d33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Mon, 5 Aug 2024 00:14:20 -0400
Subject: [PATCH] radeonsi/gfx12: fix VS output corruption with streamout

We increased VS_EXPORT_COUNT to 8 for streamout in gfx10_shader_ngg, but
we forgot to increase the attribute ring stride, causing all waves except
the first one to get corrupted VS outputs.

Fixes: f703dfd1bb8 - radeonsi: add gfx12

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 .../drivers/radeonsi/si_nir_lower_abi.c       |  2 +-
 src/gallium/drivers/radeonsi/si_shader.h      |  1 +
 .../drivers/radeonsi/si_state_shaders.cpp     | 31 +++++++++++++------
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
index 99b9c4fd930..01bf5351d52 100644
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@@ -43,7 +43,7 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                    sel->info.base.vs.blit_sgprs_amd - 1) :
          ac_nir_load_arg(b, &args->ac, args->gs_attr_address);
 
-   unsigned stride = 16 * shader->info.nr_param_exports;
+   unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
    uint32_t desc[4];
 
    ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 891015afe08..634c2189497 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -1028,6 +1028,7 @@ void si_nir_late_opts(struct nir_shader *nir);
 char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
 
 /* si_state_shaders.cpp */
+unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                       struct gfx9_gs_info *out);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 108cce44489..005cbc5541e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -1391,6 +1391,26 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
                                   shader->info.nr_pos_exports > 1));
 }
 
+/* Return the number of allocated param exports. This can be more than the number of param
+ * exports in the shader.
+ */
+unsigned si_shader_num_alloc_param_exports(struct si_shader *shader)
+{
+   unsigned num_params = shader->info.nr_param_exports;
+
+   /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs on GFX12, they can wrap
+    * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
+    * ordered ID, which can deadlock the "ordered add" loop.
+    *
+    * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring to limit
+    * the number of workgroups in flight and thus the number of ordered IDs in flight.
+    */
+   if (shader->selector->screen->info.gfx_level >= GFX12 && si_shader_uses_streamout(shader))
+      num_params = MAX2(num_params, 8);
+
+   return num_params;
+}
+
 /**
  * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
  * in NGG mode.
@@ -1541,16 +1561,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                                           gs_sel->info.writes_primid);
 
    if (sscreen->info.gfx_level >= GFX12) {
-      unsigned num_params = shader->info.nr_param_exports;
-
-      /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs, they can wrap
-       * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
-       * ordered ID, which would break the streamout algorithm.
-       * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring,
-       * which is enough to limit the range of ordered IDs that can be in flight.
-       */
-      if (si_shader_uses_streamout(shader))
-         num_params = MAX2(num_params, 8);
+      unsigned num_params = si_shader_num_alloc_param_exports(shader);
 
       shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) |
                                             S_00B220_GLG_FORCE_DISABLE(1) |