Simplifier fails to optimize for-loop endpoint. #8303
Funnily enough, in that context the simplifier is smart enough to simplify t78 > 0 to true, because it wouldn't be in the loop if the extent were less than 1, but it currently only simplifies loop_var < loop_max if loop_max is a constant. We could address this with another scoped_truth here for the non-constant case: https://github.com/halide/Halide/blob/main/src/Simplify_Stmts.cpp#L243
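For reference, a rough sketch of what that could look like in the For visitor (hypothetical only, not the actual patch; new_min and new_extent are assumptions about the surrounding code):

// While mutating the loop body, record that the loop variable stays below
// its (possibly non-constant) endpoint, so that loop_var < loop_min +
// loop_extent can fold to true inside the body.
Expr loop_var = Variable::make(Int(32), op->name);
ScopedFact bound_fact = scoped_truth(loop_var < new_min + new_extent);
Stmt new_body = mutate(op->body);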
Okay, so this bit me again. Look at the following: […] Now instead of […]
Well, the simplifier could normalize x + 1 <= y to x < y for ints. But where are these ifs coming from? It might be better to not inject them in the first place.
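As a quick way to see the missing normalization, a small standalone check (a hypothetical test harness, not from the issue):

#include "Halide.h"
#include <iostream>

using namespace Halide;

int main() {
    Var x("x"), y("y");
    // If the simplifier normalized x + 1 <= y to x < y, these two would
    // print the same canonical expression; at the time of this issue they
    // apparently do not.
    std::cout << Internal::simplify(x + 1 <= y) << "\n";
    std::cout << Internal::simplify(x < y) << "\n";
    return 0;
}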
This time, something like:

output.specialize(output.dim(0).extent() >= 32 && output.dim(1).extent() >= 16)
    .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
    .vectorize(xi, 4) // deleting this gets rid of one if like this, but introduces another
    .gpu_blocks(c)    // block over color channels
    ;

I wanted to make a specialized version of this pipeline for image sizes that will fit at least one tile.
I tried this but it didn't repro: […]

Is there some simple repro you can share?
I'll construct a repro later (11pm here). How do you check the conceptual stmt file from this? With HL_DEBUG_CODEGEN?
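(In case it helps: Func::compile_to_lowered_stmt dumps the stmt after full lowering, and setting HL_DEBUG_CODEGEN to a nonzero level in the environment makes Halide print IR to stderr during lowering. Neither gives the pre-device-split "conceptual" stmt, though; see the hack in the repro below.)

// Existing API; dumps the fully lowered stmt to a text file:
f.compile_to_lowered_stmt("lowered.stmt", {}, StmtOutputFormat::Text, Halide::Target{"host-cuda"});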
I have a repro:

#include "Halide.h"
#include <iostream>
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
    Var x("x"), y("y");
    Func f("f");
    f(x, y) = x * y;

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    f
        .compute_root()
        .specialize(f.output_buffer().dim(0).extent() >= 32)
        .never_partition_all()
        .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
        .vectorize(xi, 4, TailStrategy::ShiftInwards)
        ;

    // We should add this:
    //f.compile_to_conceptual_stmt("random_ifs.stmt", {}, StmtOutputFormat::Text, Halide::Target{"host-cuda"});
    // But we don't have it yet, so, here is a hack:
    Target t{"host-cuda"};
    Halide::Module m = Halide::Pipeline(f).compile_to_module({}, "", t);
    m.compile({
        { Halide::OutputFileType::conceptual_stmt, "random_ifs.stmt" }
    });

    printf("Success!\n");
    return 0;
}

produces:

assert(reinterpret<uint64>((struct halide_buffer_t *)f.buffer) != (uint64)0, halide_error_buffer_argument_is_null("f"))
let f = (void *)_halide_buffer_get_host((struct halide_buffer_t *)f.buffer)
let f.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)f.buffer)
let f.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)f.buffer)
let f.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 0)
let f.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 0)
let f.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 0)
let f.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)f.buffer, 1)
let f.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)f.buffer, 1)
let f.extent.1.required = f.extent.1 - select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
let f.min.1.required.s = select(f.extent.0 < 32, 0, min(f.extent.1, 16) + -16)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)f.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)f.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 0, 32, 2, (struct halide_dimension_t *)make_struct(f.min.0, f.extent.0, 1, 0, f.min.1 + f.min.1.required.s, f.extent.1.required, f.extent.0, 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)f.buffer)) {
 assert(f.type == (uint32)73728, halide_error_bad_type("Output buffer f", f.type, (uint32)73728))
 assert(f.dimensions == 2, halide_error_bad_dimensions("Output buffer f", f.dimensions, 2))
 assert(0 <= f.extent.0, halide_error_buffer_extents_negative("Output buffer f", 0, f.extent.0))
 assert((0 <= f.min.1.required.s) && ((f.extent.1.required + f.min.1.required.s) <= f.extent.1), let t31 = (f.min.1 + f.min.1.required.s) in halide_error_access_out_of_bounds("Output buffer f", 1, t31, (t31 + f.extent.1.required) + -1, f.min.1, (f.extent.1 + f.min.1) + -1))
 assert(0 <= f.extent.1, halide_error_buffer_extents_negative("Output buffer f", 1, f.extent.1))
 assert(f.stride.0 == 1, halide_error_constraint_violated("f.stride.0", f.stride.0, "1", 1))
 let f.total_extent.1 = int64(f.extent.1)*int64(f.extent.0)
 assert((uint64)abs(int64(f.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(f.extent.1)*int64(f.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("f", (uint64)abs(int64(f.extent.1)*int64(f.stride.1)), (uint64)2147483647))
 assert(f.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("f", f.total_extent.1, (int64)2147483647))
 assert(f != reinterpret<(void *)>((uint64)0), halide_error_host_is_null("Output buffer f"))
 produce f {
  if (32 <= f.extent.0) {
   let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
   assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
   let t27 = (f.extent.0 + 31)/32
   let t26 = (f.extent.1 + 15)/16
   gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t26) {
    gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t27) {
     if (((f.s0.y.yo.block_id_y + 1) <= t26) && ((f.s0.x.xo.block_id_x + 1) <= t27)) {
      gpu_thread<CUDA> (.thread_id_y, 0, 16) {
       gpu_thread<CUDA> (.thread_id_x, 0, 8) {
        let f.s0.y.yi.base.s = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
        let f.s0.x.xi.base.s = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
        let t25 = (f.min.1 + f.s0.y.yi.base.s) + .thread_id_y
        f[ramp((.thread_id_x*4) + ((f.stride.1*t25) + (f.s0.x.xi.base.s - (f.min.1*f.stride.1))), 1, 4)] = ramp(((.thread_id_x*4) + (f.min.0 + f.s0.x.xi.base.s))*t25, t25, 4)
       }
      }
     }
    }
   }
   _halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  } else {
   let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
   assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
   for (f.s0.y.rebased, 0, f.extent.1) {
    let t30 = f.s0.y.rebased*f.stride.1
    let t29 = f.min.1 + f.s0.y.rebased
    for (f.s0.x.rebased, 0, f.extent.0) {
     f[f.s0.x.rebased + t30] = (f.min.0 + f.s0.x.rebased)*t29
    }
   }
   _halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
  }
 }
}

UPDATE: Removing the […] produces:

produce f {
 if (32 <= f.extent.0) {
  let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)f.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
  let t18 = (f.extent.0 + 31)/32
  let t17 = (f.extent.1 + 15)/16
  gpu_block<CUDA> (f.s0.y.yo.block_id_y, 0, t17) {
   if ((f.s0.y.yo.block_id_y + 1) <= t17) {
    gpu_block<CUDA> (f.s0.x.xo.block_id_x, 0, t18) {
     gpu_thread<CUDA> (.thread_id_y, 0, 16) {
      gpu_thread<CUDA> (.thread_id_x, 0, 32) {
       let f.s0.y.yi.base = min(f.s0.y.yo.block_id_y*16, f.extent.1 + -16)
       let f.s0.x.xi.base = min(f.s0.x.xo.block_id_x*32, f.extent.0 + -32)
       let t16 = .thread_id_y + f.s0.y.yi.base
       f[((f.stride.1*t16) + f.s0.x.xi.base) + .thread_id_x] = (.thread_id_x + f.s0.x.xi.base)*t16
      }
     }
    }
   }
  }
  _halide_buffer_set_device_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
 } else {
  let halide_copy_to_host_result = halide_copy_to_host((struct halide_buffer_t *)f.buffer)
  assert(halide_copy_to_host_result == 0, halide_copy_to_host_result)
  for (f.s0.y, 0, f.extent.1) {
   let t19 = f.s0.y*f.stride.1
   for (f.s0.x, 0, f.extent.0) {
    f[f.s0.x + t19] = f.s0.x*f.s0.y
   }
  }
  _halide_buffer_set_host_dirty((struct halide_buffer_t *)f.buffer, (uint1)1)
 }
}
It's a bug. The if statements shouldn't be there. If you specialize and then have more splits on one side of the specialization than the other, lowering incorrectly thinks the Func is part of a compute_with fused group and throws in if statements for safety.
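In other words, distilled from the repro above (a hypothetical minimal pattern, not a confirmed test case):

// The specialize() branch gets splits from gpu_tile plus an extra one from
// vectorize, while the fallback branch of the specialization gets none.
// That asymmetry is what makes lowering mistake f for part of a
// compute_with fused group and add the defensive if statements.
f.compute_root()
 .specialize(f.output_buffer().dim(0).extent() >= 32)
 .gpu_tile(x, y, xo, yo, xi, yi, 32, 16, TailStrategy::ShiftInwards)
 .vectorize(xi, 4, TailStrategy::ShiftInwards);
// (the non-specialized path has no splits at all)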
I have no clue what's going on. I'm seeing this:

Look at the if (t84) and trace t84 backwards: t84 = loop_var < t78, and loop_var goes from 0 to t78. So t84 should always be true (the if can never be false), yet it isn't folded away. It's the first time I'm seeing something like this.
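Schematically, the pattern being described (reconstructed from the text above, not a verbatim dump):

for (loop_var, 0, t78) {       // loop_var ranges over [0, t78)
 let t84 = loop_var < t78      // always true inside this loop
 if (t84) {
  ... vectorized body ...
 } else {
  ... scalar tail ...          // unreachable; LLVM apparently deletes it
 }
}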
I went back to d9668c5 (picked at random) to test and compare, but it produces the same result. I don't have time to make a repro right now, but I wanted to document this in the meantime.

I'm not 100% sure, but I believe LLVM does notice this and simply removes the entire else-branch of this code. (Halide emits a non-vectorized else-branch, which ends up dead.)