Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support copying the overlapping region from one buffer to another. #8463

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions src/runtime/HalideBuffer.h
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just to satisfy the formatter right?

Copy link
Contributor Author

@mcourteaux mcourteaux Nov 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I put a comment on it in the PR. I have clang-format-18, but the build bots still run clang-format-17. Version 18 wants this change for some reason.

Original file line number Diff line number Diff line change
Expand Up @@ -2055,9 +2055,7 @@ class Buffer {
}

template<typename... Args>
HALIDE_ALWAYS_INLINE
storage_T *
address_of(Args... args) const {
HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
if (T_is_void) {
return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
} else {
Expand Down Expand Up @@ -2112,8 +2110,7 @@ class Buffer {
}

HALIDE_ALWAYS_INLINE
const not_void_T &
operator()() const {
const not_void_T &operator()() const {
static_assert(!T_is_void,
"Cannot use operator() on Buffer<void> types");
constexpr int expected_dims = 0;
Expand All @@ -2133,9 +2130,8 @@ class Buffer {

template<typename... Args,
typename = typename std::enable_if<AllInts<Args...>::value>::type>
HALIDE_ALWAYS_INLINE
not_void_T &
operator()(int first, Args... rest) {
HALIDE_ALWAYS_INLINE not_void_T &
operator()(int first, Args... rest) {
static_assert(!T_is_void,
"Cannot use operator() on Buffer<void> types");
constexpr int expected_dims = 1 + (int)(sizeof...(rest));
Expand Down
4 changes: 3 additions & 1 deletion src/runtime/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,9 @@ WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src
}
}

auto result = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host, stream);
auto result = cuda_do_multidimensional_copy(
user_context, c, c.src + c.src_begin, c.dst + c.dst_begin,
dst->dimensions, from_host, to_host, stream);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adds offset for destination as well as source.

Copy link
Contributor Author

@mcourteaux mcourteaux Nov 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes? I don't really know if you have a comment or are just describing.

if (result) {
return result;
}
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/d3d12compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3263,7 +3263,7 @@ WEAK int halide_d3d12compute_buffer_copy(void *user_context, struct halide_buffe
d3d12_buffer *dsrc = peel_buffer(src);
d3d12_buffer *ddst = peel_buffer(dst);
size_t src_offset = dsrc->offsetInBytes + c.src_begin;
size_t dst_offset = ddst->offsetInBytes;
size_t dst_offset = ddst->offsetInBytes + c.dst_begin;
D3D12ContextHolder d3d12_context(user_context, true);
if (d3d12_context.error()) {
return d3d12_context.error();
Expand Down
28 changes: 20 additions & 8 deletions src/runtime/device_buffer_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ namespace Internal {
// The struct that describes a host <-> dev copy to perform.
#define MAX_COPY_DIMS 16
struct device_copy {
// opaque handles for source and device memory.
// opaque handles (host or device) for source and destination memory.
uint64_t src, dst;
// The offset in the source memory to start
uint64_t src_begin;
// The offset in the source and destination memory to start
uint64_t src_begin, dst_begin;
// The multidimensional array of contiguous copy tasks that need to be done.
uint64_t extent[MAX_COPY_DIMS];
// The strides (in bytes) that separate adjacent copy tasks in each dimension.
Expand Down Expand Up @@ -70,13 +70,18 @@ WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, in
WEAK void copy_memory(const device_copy &copy, void *user_context) {
// If this is a zero copy buffer, these pointers will be the same.
if (copy.src != copy.dst) {
copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, 0);
copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, copy.dst_begin);
} else {
debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
}
}

// Fills the entire dst buffer, which must be contained within src
// All crops are supported. It copies the maximum amount of pixels from src to dst.
// That maximum number of pixels is determined by the overlapping region of the two
// buffers. This means that you can use it in scenarios:
// 1) Fill the entire dst buffer, when the dst buffer bounds are contained within src.
// 2) Copy the entire src buffer, when the src buffer bounds are contained within dst, to dst.
// 3) Copy only the overlapping region between two buffers, from src to dst.
WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
const halide_buffer_t *dst, bool dst_host) {
// Make a copy job representing copying the first pixel only.
Expand All @@ -90,12 +95,19 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
c.dst_stride_bytes[i] = 0;
}

// Offset the src base pointer to the right point in its buffer.
// Offset the src and dst base pointer to the right point in their buffer.
c.src_begin = 0;
c.dst_begin = 0;
for (int i = 0; i < src->dimensions; i++) {
c.src_begin += (int64_t)src->dim[i].stride * (int64_t)(dst->dim[i].min - src->dim[i].min);
int64_t dim_diff = int64_t(dst->dim[i].min - src->dim[i].min);
if (dim_diff > 0) {
c.src_begin += (int64_t)src->dim[i].stride * dim_diff;
} else {
c.dst_begin += (int64_t)dst->dim[i].stride * (-dim_diff);
}
}
c.src_begin *= c.chunk_size;
c.dst_begin *= c.chunk_size;

if (src->dimensions != dst->dimensions ||
src->type.bytes() != dst->type.bytes() ||
Expand Down Expand Up @@ -134,7 +146,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1];
c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
}
c.extent[insert] = dst->dim[i].extent;
c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent);
// debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
c.dst_stride_bytes[insert] = dst_stride_bytes;
c.src_stride_bytes[insert] = src_stride_bytes;
Expand Down
7 changes: 4 additions & 3 deletions src/runtime/metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ namespace {
void do_device_to_device_copy(void *user_context, mtl_blit_command_encoder *encoder,
const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) {
if (d == 0) {
buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, c.src_begin + src_offset,
buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, src_offset,
((device_handle *)c.dst)->buf, dst_offset, c.chunk_size);
} else {
// TODO: deal with negative strides. Currently the code in
Expand Down Expand Up @@ -1108,8 +1108,9 @@ WEAK int halide_metal_buffer_copy(void *user_context, struct halide_buffer_t *sr
const char *buffer_label = "halide_metal_buffer_copy";
mtl_command_buffer *blit_command_buffer = new_command_buffer(metal_context.queue, buffer_label, strlen(buffer_label));
mtl_blit_command_encoder *blit_encoder = new_blit_command_encoder(blit_command_buffer);
do_device_to_device_copy(user_context, blit_encoder, c, ((device_handle *)c.src)->offset,
((device_handle *)c.dst)->offset, dst->dimensions);
do_device_to_device_copy(user_context, blit_encoder, c,
((device_handle *)c.src)->offset + c.src_begin,
((device_handle *)c.dst)->offset + c.dst_begin, dst->dimensions);
end_encoding(blit_encoder);
commit_command_buffer(blit_command_buffer);
} else {
Expand Down
11 changes: 9 additions & 2 deletions src/runtime/opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
size_t max_work_item_sizes[4] = {
0,
};
cl_uint mem_base_addr_align = 0;

struct {
void *dst;
Expand All @@ -521,6 +522,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
{&max_work_group_size, sizeof(max_work_group_size), CL_DEVICE_MAX_WORK_GROUP_SIZE},
{&max_work_item_dimensions, sizeof(max_work_item_dimensions), CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS},
{&max_work_item_sizes[0], sizeof(max_work_item_sizes), CL_DEVICE_MAX_WORK_ITEM_SIZES},
{&mem_base_addr_align, sizeof(mem_base_addr_align), CL_DEVICE_MEM_BASE_ADDR_ALIGN},
{nullptr}};

// Do all the queries.
Expand All @@ -544,7 +546,8 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
<< " max work item sizes: " << (uint64_t)max_work_item_sizes[0]
<< "x" << (uint64_t)max_work_item_sizes[1]
<< "x" << (uint64_t)max_work_item_sizes[2]
<< "x" << (uint64_t)max_work_item_sizes[3] << "\n";
<< "x" << (uint64_t)max_work_item_sizes[3] << "\n"
<< " mem base addr align: " << mem_base_addr_align << "\n";
#endif

// Create context and command queue.
Expand Down Expand Up @@ -1035,7 +1038,9 @@ WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *s
}
#endif

auto result = opencl_do_multidimensional_copy(user_context, ctx, c, c.src_begin, 0, dst->dimensions, from_host, to_host);
auto result = opencl_do_multidimensional_copy(
user_context, ctx, c, c.src_begin, c.dst_begin,
dst->dimensions, from_host, to_host);
if (result) {
return result;
}
Expand Down Expand Up @@ -1155,6 +1160,8 @@ WEAK int halide_opencl_run(void *user_context,
// span the crop.
mem = clCreateSubBuffer(mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
sub_buffers[sub_buffers_saved++] = mem;
debug(user_context) << "Create subbuffer " << (void *)mem << ": "
<< "offset=" << region.origin << ", size=" << region.size << "\n";
}
if (err == CL_SUCCESS) {
debug(user_context) << "Mapped dev handle is: " << (void *)mem << "\n";
Expand Down
10 changes: 3 additions & 7 deletions src/runtime/vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid
copy_helper.src = (uint64_t)(staging_buffer);
copy_helper.dst = (uint64_t)(device_buffer);
uint64_t src_offset = copy_helper.src_begin;
uint64_t dst_offset = device_region->range.head_offset;
uint64_t dst_offset = copy_helper.dst_begin + device_region->range.head_offset;

// enqueue the copy operation, using the allocated buffers
error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper,
Expand Down Expand Up @@ -639,7 +639,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_
copy_helper.src = (uint64_t)(device_buffer);
copy_helper.dst = (uint64_t)(staging_buffer);
uint64_t src_offset = copy_helper.src_begin + device_region->range.head_offset;
uint64_t dst_offset = 0;
uint64_t dst_offset = copy_helper.dst_begin;

// enqueue the copy operation, using the allocated buffers
error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper,
Expand Down Expand Up @@ -918,11 +918,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s
copy_helper.src = (uint64_t)(src_device_buffer);
copy_helper.dst = (uint64_t)(dst_device_buffer);
uint64_t src_offset = copy_helper.src_begin + src_buffer_region->range.head_offset;
uint64_t dst_offset = dst_buffer_region->range.head_offset;
if (!from_host && !to_host) {
src_offset = src_buffer_region->range.head_offset;
dst_offset = dst_buffer_region->range.head_offset;
}
uint64_t dst_offset = copy_helper.dst_begin + dst_buffer_region->range.head_offset;

debug(user_context) << " src region=" << (void *)src_memory_region << " buffer=" << (void *)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n";
debug(user_context) << " dst region=" << (void *)dst_memory_region << " buffer=" << (void *)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n";
Expand Down
6 changes: 3 additions & 3 deletions src/runtime/vulkan_resources.h
Original file line number Diff line number Diff line change
Expand Up @@ -1548,9 +1548,9 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff
(!from_host && !to_host)) {

VkBufferCopy buffer_copy = {
c.src_begin + src_offset, // srcOffset
dst_offset, // dstOffset
c.chunk_size // size
src_offset, // srcOffset
dst_offset, // dstOffset
c.chunk_size // size
};

VkBuffer *src_buffer = reinterpret_cast<VkBuffer *>(c.src);
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ WEAK int halide_webgpu_buffer_copy(void *user_context,
ErrorScope error_scope(user_context, context.device);

err = do_multidimensional_copy(user_context, &context, c,
c.src_begin, 0, dst->dimensions,
c.src_begin, c.dst_begin, dst->dimensions,
from_host, to_host);
if (err == halide_error_code_success) {
err = error_scope.wait();
Expand Down
79 changes: 70 additions & 9 deletions test/correctness/device_buffer_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,11 @@ int main(int argc, char **argv) {

for (int i = 0; i < 128; i++) {
for (int j = 0; j < 128; j++) {
bool in_gpu3 = (i >= gpu_buf2.dim(0).min()) &&
(i < (gpu_buf2.dim(0).min() + gpu_buf2.dim(0).extent())) &&
bool in_gpu2 = (i >= gpu_buf2.dim(0).min()) &&
(i <= gpu_buf2.dim(0).max()) &&
(j >= gpu_buf2.dim(1).min()) &&
(j < (gpu_buf2.dim(1).min() + gpu_buf2.dim(1).extent()));
assert(gpu_buf1(i, j) == (in_gpu3 ? 0 : (i + j * 256)));
(j <= gpu_buf2.dim(1).max());
assert(gpu_buf1(i, j) == (in_gpu2 ? 0 : (i + j * 256)));
}
}
}
Expand All @@ -121,15 +121,15 @@ int main(int argc, char **argv) {
for (int i = 0; i < 128; i++) {
for (int j = 0; j < 128; j++) {
bool in_cpu1 = (i >= cpu_buf1.dim(0).min()) &&
(i < (cpu_buf1.dim(0).min() + cpu_buf1.dim(0).extent())) &&
(i <= cpu_buf1.dim(0).max()) &&
(j >= cpu_buf1.dim(1).min()) &&
(j < (cpu_buf1.dim(1).min() + cpu_buf1.dim(1).extent()));
(j <= cpu_buf1.dim(1).max());
assert(cpu_buf(i, j) == (in_cpu1 ? (i + j * 256) : 0));
}
}
}

printf("Test copy device to device -- subset area.\n");
printf("Test copy device to device -- subset area (from bigger to smaller buffer).\n");
{
Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
assert(gpu_buf1.raw_buffer()->device_interface != nullptr);
Expand All @@ -147,14 +147,75 @@ int main(int argc, char **argv) {
for (int i = 0; i < 128; i++) {
for (int j = 0; j < 128; j++) {
bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) &&
(i < (gpu_buf3.dim(0).min() + gpu_buf3.dim(0).extent())) &&
(i <= gpu_buf3.dim(0).max()) &&
(j >= gpu_buf3.dim(1).min()) &&
(j < (gpu_buf3.dim(1).min() + gpu_buf3.dim(1).extent()));
(j <= gpu_buf3.dim(1).max());
assert(gpu_buf2(i, j) == (i + j * 256 + (in_gpu3 ? 0 : 256000)));
}
}
}

printf("Test copy device to device -- subset area (from smaller to bigger buffer).\n");
{
Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
assert(gpu_buf1.raw_buffer()->device_interface != nullptr);

Halide::Runtime::Buffer<int32_t> gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000);
assert(gpu_buf2.raw_buffer()->device_interface != nullptr);

Halide::Runtime::Buffer<int32_t> gpu_buf3 = gpu_buf2.cropped({{32, 64}, {32, 64}});
assert(gpu_buf3.raw_buffer()->device_interface != nullptr);

assert(gpu_buf3.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf3, gpu_buf1.raw_buffer()->device_interface, gpu_buf1) == 0);
gpu_buf1.set_device_dirty();
gpu_buf1.copy_to_host();

for (int i = 0; i < 128; i++) {
for (int j = 0; j < 128; j++) {
bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) &&
(i <= gpu_buf3.dim(0).max()) &&
(j >= gpu_buf3.dim(1).min()) &&
(j <= gpu_buf3.dim(1).max());
assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu3 ? 256000 : 0)));
}
}
}

printf("Test copy device to device -- subset area (from and to not-contained-within-each-other crops).\n");
{
Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
assert(gpu_buf1.raw_buffer()->device_interface != nullptr);

Halide::Runtime::Buffer<int32_t> gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000);
assert(gpu_buf2.raw_buffer()->device_interface != nullptr);

// Two crops: one horizontal, and one vertical rectangle.
Halide::Runtime::Buffer<int32_t> gpu_buf1_crop = gpu_buf1.cropped({{32, 64}, {32, 16}});
Halide::Runtime::Buffer<int32_t> gpu_buf2_crop = gpu_buf2.cropped({{32, 16}, {32, 64}});
assert(gpu_buf1_crop.raw_buffer()->device_interface != nullptr);
assert(gpu_buf2_crop.raw_buffer()->device_interface != nullptr);

// Copy from the crop to another crop.
assert(gpu_buf2_crop.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf2_crop, gpu_buf1_crop.raw_buffer()->device_interface, gpu_buf1_crop) == 0);
gpu_buf1.set_device_dirty();
gpu_buf1.copy_to_host();

for (int i = 0; i < 128; i++) {
for (int j = 0; j < 128; j++) {
bool in_gpu1_crop = (i >= gpu_buf1_crop.dim(0).min()) &&
(i <= gpu_buf1_crop.dim(0).max()) &&
(j >= gpu_buf1_crop.dim(1).min()) &&
(j <= gpu_buf1_crop.dim(1).max());
bool in_gpu2_crop = (i >= gpu_buf2_crop.dim(0).min()) &&
(i <= gpu_buf2_crop.dim(0).max()) &&
(j >= gpu_buf2_crop.dim(1).min()) &&
(j <= gpu_buf2_crop.dim(1).max());
// printf("gpu_buf1(%d, %d) = %d (in_gpu1_crop=%d in_gpu2_crop=%d)\n", i, j, gpu_buf1(i, j), in_gpu1_crop, in_gpu2_crop);
assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu1_crop && in_gpu2_crop ? 256000 : 0)));
}
}
}

printf("Test copy from device no src host.\n");
{
Halide::Runtime::Buffer<int32_t> gpu_buf = make_gpu_buffer(hexagon_rpc);
Expand Down
Loading